From e9636216496240e0e0174631de46d107562e9215 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 25 Apr 2024 06:37:35 +0530 Subject: [PATCH 01/56] [PixArt] fix small nits in pixart sigma (#7767) fix small nits in pixart sigma --- .../pixart_alpha/pipeline_pixart_alpha.py | 9 --------- .../pixart_alpha/pipeline_pixart_sigma.py | 15 +++------------ 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index c9efe594e398..22f0c507e5ee 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -273,15 +273,6 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py - def mask_text_embeddings(self, emb, mask): - if emb.shape[0] == 1: - keep_index = mask.sum().item() - return emb[:, :, :keep_index, :], keep_index - else: - masked_feature = emb * mask[:, None, :, None] - return masked_feature, emb.shape[2] - # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt def encode_prompt( self, diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index dadf41c176ea..e3dd3ddc0404 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -199,16 +199,7 @@ def __init__( self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor) - # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py - def mask_text_embeddings(self, emb, mask): - if emb.shape[0] == 1: - keep_index = mask.sum().item() - return emb[:, :, :keep_index, :], keep_index - else: - masked_feature = emb * mask[:, None, :, None] - return masked_feature, emb.shape[2] - - # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt + # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt def encode_prompt( self, prompt: Union[str, List[str]], @@ -369,7 +360,7 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.py + # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.check_inputs def check_inputs( self, prompt, @@ -462,7 +453,7 @@ def process(text: str): return [process(t) for t in text] - # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline._clean_caption + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption def _clean_caption(self, caption): caption = str(caption) caption = ul.unquote_plus(caption) From b833d0fc80900525a1e3b4df10422cc4283c4a55 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 25 Apr 2024 07:29:04 +0530 Subject: [PATCH 02/56] [Tests] mark UNetControlNetXSModelTests::test_forward_no_control to be flaky (#7771) decorate UNetControlNetXSModelTests::test_forward_no_control with is_flaky --- tests/models/unets/test_models_unet_controlnetxs.py | 7 ++----- 1 file changed, 2 insertions(+), 5 
deletions(-) diff --git a/tests/models/unets/test_models_unet_controlnetxs.py b/tests/models/unets/test_models_unet_controlnetxs.py index 8c9b43a20ad6..6f3662e01750 100644 --- a/tests/models/unets/test_models_unet_controlnetxs.py +++ b/tests/models/unets/test_models_unet_controlnetxs.py @@ -22,11 +22,7 @@ from diffusers import ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel from diffusers.utils import logging -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - torch_device, -) +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, is_flaky, torch_device from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin @@ -305,6 +301,7 @@ def _set_gradient_checkpointing_new(self, module, value=False): assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET assert all(modules_with_gc_enabled.values()), "All modules should be enabled" + @is_flaky def test_forward_no_control(self): unet = self.get_dummy_unet() controlnet = self.get_dummy_controlnet_from_unet(unet) From 142f353e1c638ff1d20bd798402b68f72c1ebbdd Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 25 Apr 2024 18:05:27 +0530 Subject: [PATCH 03/56] Fix lora device test (#7738) * fix lora device test * fix more. * fix more/ * quality * empty --------- Co-authored-by: Dhruv Nair --- src/diffusers/loaders/lora.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 5d89658830f1..8703cdee4011 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -1268,9 +1268,10 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, unet_module.lora_A[adapter_name].to(device) unet_module.lora_B[adapter_name].to(device) # this is a param, not a module, so device placement is not in-place -> re-assign - unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[ - adapter_name - ].to(device) + if hasattr(unet_module, "lora_magnitude_vector") and unet_module.lora_magnitude_vector is not None: + unet_module.lora_magnitude_vector[adapter_name] = unet_module.lora_magnitude_vector[ + adapter_name + ].to(device) # Handle the text encoder modules_to_process = [] @@ -1288,9 +1289,13 @@ def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, text_encoder_module.lora_A[adapter_name].to(device) text_encoder_module.lora_B[adapter_name].to(device) # this is a param, not a module, so device placement is not in-place -> re-assign - text_encoder_module.lora_magnitude_vector[ - adapter_name - ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device) + if ( + hasattr(text_encoder, "lora_magnitude_vector") + and text_encoder_module.lora_magnitude_vector is not None + ): + text_encoder_module.lora_magnitude_vector[ + adapter_name + ] = text_encoder_module.lora_magnitude_vector[adapter_name].to(device) class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): From 181688012a2abadc93b316d91a513bee36193615 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 25 Apr 2024 16:15:12 -0700 Subject: [PATCH 04/56] [docs] Reproducible pipelines (#7769) * reproducibility * feedback * feedback * fix path * github link --- docs/source/en/_toctree.yml | 4 +- docs/source/en/api/utilities.md | 4 + docs/source/en/stable_diffusion.md | 2 +- .../en/using-diffusers/reproducibility.md | 191 ------------------ .../en/using-diffusers/reusing_seeds.md | 
171 +++++++++++++--- docs/source/ja/stable_diffusion.md | 2 +- docs/source/ko/stable_diffusion.md | 2 +- docs/source/zh/stable_diffusion.md | 2 +- 8 files changed, 148 insertions(+), 230 deletions(-) delete mode 100644 docs/source/en/using-diffusers/reproducibility.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 140bdfad3a89..f098faa47e5a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -62,7 +62,7 @@ - local: using-diffusers/callback title: Pipeline callbacks - local: using-diffusers/reusing_seeds - title: Improve image quality with deterministic generation + title: Reproducible pipelines - local: using-diffusers/control_brightness title: Control image brightness - local: using-diffusers/weighted_prompts @@ -89,8 +89,6 @@ title: Shap-E - local: using-diffusers/diffedit title: DiffEdit - - local: using-diffusers/reproducibility - title: Create reproducible pipelines - local: using-diffusers/custom_pipeline_examples title: Community pipelines - local: using-diffusers/contribute_pipeline diff --git a/docs/source/en/api/utilities.md b/docs/source/en/api/utilities.md index 71253db215ab..d4f4d7d7964f 100644 --- a/docs/source/en/api/utilities.md +++ b/docs/source/en/api/utilities.md @@ -37,3 +37,7 @@ Utility and helper functions for working with 🤗 Diffusers. ## make_image_grid [[autodoc]] utils.make_image_grid + +## randn_tensor + +[[autodoc]] utils.torch_utils.randn_tensor diff --git a/docs/source/en/stable_diffusion.md b/docs/source/en/stable_diffusion.md index 877a4ac70823..db4953ebbffd 100644 --- a/docs/source/en/stable_diffusion.md +++ b/docs/source/en/stable_diffusion.md @@ -49,7 +49,7 @@ One of the simplest ways to speed up inference is to place the pipeline on a GPU pipeline = pipeline.to("cuda") ``` -To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility): +To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reusing_seeds): ```python import torch diff --git a/docs/source/en/using-diffusers/reproducibility.md b/docs/source/en/using-diffusers/reproducibility.md deleted file mode 100644 index 7c61578d9239..000000000000 --- a/docs/source/en/using-diffusers/reproducibility.md +++ /dev/null @@ -1,191 +0,0 @@ - - -# Create reproducible pipelines - -[[open-in-colab]] - -Reproducibility is important for testing, replicating results, and can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint. - -This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms. - - - -💡 We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html): - -> Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. 
Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds. - - - -## Control randomness - -During inference, pipelines rely heavily on random sampling operations which include creating the -Gaussian noise tensors to denoise and adding noise to the scheduling step. - -Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps: - -```python -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# load model and scheduler -ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True) - -# run pipeline for just two steps and return numpy tensor -image = ddim(num_inference_steps=2, output_type="np").images -print(np.abs(image).sum()) -``` - -Running the code above prints one value, but if you run it again you get a different value. What is going on here? - -Every time the pipeline is run, [`torch.randn`](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create Gaussian noise which is denoised stepwise. This leads to a different result each time it is run, which is great for diffusion pipelines since it generates a different random image each time. - -But if you need to reliably generate the same image, that'll depend on whether you're running the pipeline on a CPU or GPU. - -### CPU - -To generate reproducible results on a CPU, you'll need to use a PyTorch [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed: - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# load model and scheduler -ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True) - -# create a generator for reproducibility -generator = torch.Generator(device="cpu").manual_seed(0) - -# run pipeline for just two steps and return numpy tensor -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - -Now when you run the code above, it always prints a value of `1491.1711` no matter what because the `Generator` object with the seed is passed to all the random functions of the pipeline. - -If you run this code example on your specific hardware and PyTorch version, you should get a similar, if not the same, result. - - - -💡 It might be a bit unintuitive at first to pass `Generator` objects to the pipeline instead of -just integer values representing the seed, but this is the recommended design when dealing with -probabilistic models in PyTorch, as `Generator`s are *random states* that can be -passed to multiple pipelines in a sequence. - - - -### GPU - -Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. 
For example, if you run the same code example above on a GPU: - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# load model and scheduler -ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True) -ddim.to("cuda") - -# create a generator for reproducibility -generator = torch.Generator(device="cuda").manual_seed(0) - -# run pipeline for just two steps and return numpy tensor -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - -The result is not the same even though you're using an identical seed because the GPU uses a different random number generator than the CPU. - -To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU. - -You'll see the results are much closer now! - -```python -import torch -from diffusers import DDIMPipeline -import numpy as np - -model_id = "google/ddpm-cifar10-32" - -# load model and scheduler -ddim = DDIMPipeline.from_pretrained(model_id, use_safetensors=True) -ddim.to("cuda") - -# create a generator for reproducibility; notice you don't place it on the GPU! -generator = torch.manual_seed(0) - -# run pipeline for just two steps and return numpy tensor -image = ddim(num_inference_steps=2, output_type="np", generator=generator).images -print(np.abs(image).sum()) -``` - - - -💡 If reproducibility is important, we recommend always passing a CPU generator. -The performance loss is often neglectable, and you'll generate much more similar -values than if the pipeline had been run on a GPU. - - - -Finally, for more complex pipelines such as [`UnCLIPPipeline`], these are often extremely -susceptible to precision error propagation. Don't expect similar results across -different GPU hardware or PyTorch versions. In this case, you'll need to run -exactly the same hardware and PyTorch version for full reproducibility. - -## Deterministic algorithms - -You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go! - -Nondeterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [`CUBLAS_WORKSPACE_CONFIG`](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime. - -PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Lastly, pass `True` to [`torch.use_deterministic_algorithms`](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html) to enable deterministic algorithms. - -```py -import os -import torch - -os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" - -torch.backends.cudnn.benchmark = False -torch.use_deterministic_algorithms(True) -``` - -Now when you run the same pipeline twice, you'll get identical results. 
- -```py -import torch -from diffusers import DDIMScheduler, StableDiffusionPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipe = StableDiffusionPipeline.from_pretrained(model_id, use_safetensors=True).to("cuda") -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) -g = torch.Generator(device="cuda") - -prompt = "A bear is playing a guitar on Times Square" - -g.manual_seed(0) -result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images - -g.manual_seed(0) -result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images - -print("L_inf dist =", abs(result1 - result2).max()) -"L_inf dist = tensor(0., device='cuda:0')" -``` diff --git a/docs/source/en/using-diffusers/reusing_seeds.md b/docs/source/en/using-diffusers/reusing_seeds.md index bad567b55c52..989d0ba3eb07 100644 --- a/docs/source/en/using-diffusers/reusing_seeds.md +++ b/docs/source/en/using-diffusers/reusing_seeds.md @@ -10,72 +10,179 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Improve image quality with deterministic generation +# Reproducible pipelines -[[open-in-colab]] +Diffusion models are inherently random which is what allows it to generate different outputs every time it is run. But there are certain times when you want to generate the same output every time, like when you're testing, replicating results, and even [improving image quality](#deterministic-batch-generation). While you can't expect to get identical results across platforms, you can expect reproducible results across releases and platforms within a certain tolerance range (though even this may vary). -A common way to improve the quality of generated images is with *deterministic batch generation*, generate a batch of images and select one image to improve with a more detailed prompt in a second round of inference. The key is to pass a list of [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html#generator)'s to the pipeline for batched image generation, and tie each `Generator` to a seed so you can reuse it for an image. +This guide will show you how to control randomness for deterministic generation on a CPU and GPU. -Let's use [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) for example, and generate several versions of the following prompt: +> [!TIP] +> We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html): +> +> "Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds." -```py -prompt = "Labrador in the style of Vermeer" +## Control randomness + +During inference, pipelines rely heavily on random sampling operations which include creating the +Gaussian noise tensors to denoise and adding noise to the scheduling step. + +Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps. 
+ +```python +from diffusers import DDIMPipeline +import numpy as np + +ddim = DDIMPipeline.from_pretrained( "google/ddpm-cifar10-32", use_safetensors=True) +image = ddim(num_inference_steps=2, output_type="np").images +print(np.abs(image).sum()) ``` -Instantiate a pipeline with [`DiffusionPipeline.from_pretrained`] and place it on a GPU (if available): +Running the code above prints one value, but if you run it again you get a different value. + +Each time the pipeline is run, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create the Gaussian noise tensors. This leads to a different result each time it is run and enables the diffusion pipeline to generate a different random image each time. + +But if you need to reliably generate the same image, that depends on whether you're running the pipeline on a CPU or GPU. + +> [!TIP] +> It might seem unintuitive to pass `Generator` objects to a pipeline instead of the integer value representing the seed. However, this is the recommended design when working with probabilistic models in PyTorch because a `Generator` is a *random state* that can be passed to multiple pipelines in a sequence. As soon as the `Generator` is consumed, the *state* is changed in place which means even if you passed the same `Generator` to a different pipeline, it won't produce the same result because the state is already changed. + + + + +To generate reproducible results on a CPU, you'll need to use a PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed. Now when you run the code, it always prints a value of `1491.1711` because the `Generator` object with the seed is passed to all the random functions in the pipeline. You should get a similar, if not the same, result on whatever hardware and PyTorch version you're using. ```python import torch -from diffusers import DiffusionPipeline -from diffusers.utils import make_image_grid +import numpy as np +from diffusers import DDIMPipeline -pipe = DiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True -) -pipe = pipe.to("cuda") +ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) +generator = torch.Generator(device="cpu").manual_seed(0) +image = ddim(num_inference_steps=2, output_type="np", generator=generator).images +print(np.abs(image).sum()) ``` -Now, define four different `Generator`s and assign each `Generator` a seed (`0` to `3`) so you can reuse a `Generator` later for a specific image: + + + +Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example from the CPU example, you'll get a different result even though the seed is identical. This is because the GPU uses a different random number generator than the CPU. 
```python -generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)] +import torch +import numpy as np +from diffusers import DDIMPipeline + +ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) +ddim.to("cuda") +generator = torch.Generator(device="cuda").manual_seed(0) +image = ddim(num_inference_steps=2, output_type="np", generator=generator).images +print(np.abs(image).sum()) +``` + +To avoid this issue, Diffusers has a [`~utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The [`~utils.torch_utils.randn_tensor`] function is used everywhere inside the pipeline. Now you can call [torch.manual_seed](https://pytorch.org/docs/stable/generated/torch.manual_seed.html) which automatically creates a CPU `Generator` that can be passed to the pipeline even if it is being run on a GPU. + +```python +import torch +import numpy as np +from diffusers import DDIMPipeline + +ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) +ddim.to("cuda") +generator = torch.manual_seed(0) +image = ddim(num_inference_steps=2, output_type="np", generator=generator).images +print(np.abs(image).sum()) ``` - +> [!TIP] +> If reproducibility is important to your use case, we recommend always passing a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values than if the pipeline had been run on a GPU. + +Finally, more complex pipelines such as [`UnCLIPPipeline`], are often extremely +susceptible to precision error propagation. You'll need to use +exactly the same hardware and PyTorch version for full reproducibility. + + + + +## Deterministic algorithms + +You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. The downside is that deterministic algorithms may be slower than non-deterministic ones and you may observe a decrease in performance. -To create a batched seed, you should use a list comprehension that iterates over the length specified in `range()`. This creates a unique `Generator` object for each image in the batch. If you only multiply the `Generator` by the batch size, this only creates one `Generator` object that is used sequentially for each image in the batch. +Non-deterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime. -For example, if you want to use the same seed to create 4 identical images: +PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Set Diffusers [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) to enable deterministic algorithms. ```py -❌ [torch.Generator().manual_seed(seed)] * 4 +enable_full_determinism() +``` + +Now when you run the same pipeline twice, you'll get identical results. 
+ +```py +import torch +from diffusers import DDIMScheduler, StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True).to("cuda") +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) +g = torch.Generator(device="cuda") + +prompt = "A bear is playing a guitar on Times Square" -✅ [torch.Generator().manual_seed(seed) for _ in range(4)] +g.manual_seed(0) +result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images + +g.manual_seed(0) +result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images + +print("L_inf dist =", abs(result1 - result2).max()) +"L_inf dist = tensor(0., device='cuda:0')" ``` - +## Deterministic batch generation -Generate the images and have a look: +A practical application of creating reproducible pipelines is *deterministic batch generation*. You generate a batch of images and select one image to improve with a more detailed prompt. The main idea is to pass a list of [Generator's](https://pytorch.org/docs/stable/generated/torch.Generator.html) to the pipeline and tie each `Generator` to a seed so you can reuse it. -```python -images = pipe(prompt, generator=generator, num_images_per_prompt=4).images -make_image_grid(images, rows=2, cols=2) +Let's use the [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) checkpoint and generate a batch of images. + +```py +import torch +from diffusers import DiffusionPipeline +from diffusers.utils import make_image_grid + +pipeline = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True +) +pipeline = pipeline.to("cuda") ``` -![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds.jpg) +Define four different `Generator`s and assign each `Generator` a seed (`0` to `3`). Then generate a batch of images and pick one to iterate on. -In this example, you'll improve upon the first image - but in reality, you can use any image you want (even the image with double sets of eyes!). The first image used the `Generator` with seed `0`, so you'll reuse that `Generator` for the second round of inference. To improve the quality of the image, add some additional text to the prompt: +> [!WARNING] +> Use a list comprehension that iterates over the batch size specified in `range()` to create a unique `Generator` object for each image in the batch. If you multiply the `Generator` by the batch size integer, it only creates *one* `Generator` object that is used sequentially for each image in the batch. +> +> ```py +> [torch.Generator().manual_seed(seed)] * 4 +> ``` ```python -prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]] -generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)] +generator = [torch.Generator(device="cuda").manual_seed(i) for i in range(4)] +prompt = "Labrador in the style of Vermeer" +images = pipeline(prompt, generator=generator, num_images_per_prompt=4).images[0] +make_image_grid(images, rows=2, cols=2) ``` -Create four generators with seed `0`, and generate another batch of images, all of which should look like the first image from the previous round! +
+<!-- image: 2x2 grid of the four generated "Labrador in the style of Vermeer" images -->
+ +Let's improve the first image (you can choose any image you want) which corresponds to the `Generator` with seed `0`. Add some additional text to your prompt and then make sure you reuse the same `Generator` with seed `0`. All the generated images should resemble the first image. ```python -images = pipe(prompt, generator=generator).images +prompt = [prompt + t for t in [", highly realistic", ", artsy", ", trending", ", colorful"]] +generator = [torch.Generator(device="cuda").manual_seed(0) for i in range(4)] +images = pipeline(prompt, generator=generator).images make_image_grid(images, rows=2, cols=2) ``` -![img](https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/reusabe_seeds_2.jpg) +
+<!-- image: 2x2 grid of images generated with the refined prompts, all resembling the first image -->
diff --git a/docs/source/ja/stable_diffusion.md b/docs/source/ja/stable_diffusion.md index cae244dab2b6..b22178b75529 100644 --- a/docs/source/ja/stable_diffusion.md +++ b/docs/source/ja/stable_diffusion.md @@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief" pipeline = pipeline.to("cuda") ``` -同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reproducibility)の種を設定します: +同じイメージを使って改良できるようにするには、[`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)を使い、[reproducibility](./using-diffusers/reusing_seeds)の種を設定します: ```python import torch diff --git a/docs/source/ko/stable_diffusion.md b/docs/source/ko/stable_diffusion.md index 678db4ff9eae..342f17bbff31 100644 --- a/docs/source/ko/stable_diffusion.md +++ b/docs/source/ko/stable_diffusion.md @@ -49,7 +49,7 @@ prompt = "portrait photo of a old warrior chief" pipeline = pipeline.to("cuda") ``` -동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reproducibility)에 대한 시드를 설정하세요: +동일한 이미지를 사용하고 개선할 수 있는지 확인하려면 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html)를 사용하고 [재현성](./using-diffusers/reusing_seeds)에 대한 시드를 설정하세요: ```python import torch diff --git a/docs/source/zh/stable_diffusion.md b/docs/source/zh/stable_diffusion.md index 614d9505d7b8..d92cdf7d1163 100644 --- a/docs/source/zh/stable_diffusion.md +++ b/docs/source/zh/stable_diffusion.md @@ -51,7 +51,7 @@ prompt = "portrait photo of a old warrior chief" pipeline = pipeline.to("cuda") ``` -为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reproducibility): +为了确保您可以使用相同的图像并对其进行改进,使用 [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) 方法,然后设置一个随机数种子 以确保其 [复现性](./using-diffusers/reusing_seeds): ```python import torch From fa750a15bd90f64c1fb53df3227cd0e247e2f1e8 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 25 Apr 2024 16:55:35 -0700 Subject: [PATCH 05/56] [docs] Refactor image quality docs (#7758) * refactor * code snippets * fix path * fix path in guide * code outputs * align toctree title * title * fix title --- docs/source/en/_toctree.yml | 6 +- docs/source/en/api/pipelines/overview.md | 5 + .../en/using-diffusers/control_brightness.md | 58 ------ docs/source/en/using-diffusers/freeu.md | 135 ------------- .../en/using-diffusers/image_quality.md | 190 ++++++++++++++++++ 5 files changed, 197 insertions(+), 197 deletions(-) delete mode 100644 docs/source/en/using-diffusers/control_brightness.md delete mode 100644 docs/source/en/using-diffusers/freeu.md create mode 100644 docs/source/en/using-diffusers/image_quality.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f098faa47e5a..357afb2ea261 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -63,12 +63,10 @@ title: Pipeline callbacks - local: using-diffusers/reusing_seeds title: Reproducible pipelines - - local: using-diffusers/control_brightness - title: Control image brightness + - local: using-diffusers/image_quality + title: Controlling image quality - local: using-diffusers/weighted_prompts title: Prompt techniques - - local: using-diffusers/freeu - title: Improve generation quality with FreeU title: Inference techniques - sections: - local: using-diffusers/sdxl diff --git 
a/docs/source/en/api/pipelines/overview.md b/docs/source/en/api/pipelines/overview.md index cd1232a90d6e..e7b8bf4936c0 100644 --- a/docs/source/en/api/pipelines/overview.md +++ b/docs/source/en/api/pipelines/overview.md @@ -97,6 +97,11 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an - to - components + +[[autodoc]] pipelines.StableDiffusionMixin.enable_freeu + +[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu + ## FlaxDiffusionPipeline [[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline diff --git a/docs/source/en/using-diffusers/control_brightness.md b/docs/source/en/using-diffusers/control_brightness.md deleted file mode 100644 index 5fad664f60f3..000000000000 --- a/docs/source/en/using-diffusers/control_brightness.md +++ /dev/null @@ -1,58 +0,0 @@ - - -# Control image brightness - -The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images. - - - -💡 Take a look at the paper linked above for more details about the proposed solutions! - - - -One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`: - -```bash ---prediction_type="v_prediction" -``` - -For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`. - -Next, configure the following parameters in the [`DDIMScheduler`]: - -1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR) -2. `timestep_spacing="trailing"`, starts sampling from the last timestep - -```py -from diffusers import DiffusionPipeline, DDIMScheduler - -pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True) - -# switch the scheduler in the pipeline to use the DDIMScheduler -pipeline.scheduler = DDIMScheduler.from_config( - pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" -) -pipeline.to("cuda") -``` - -Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure: - -```py -prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" -image = pipeline(prompt, guidance_rescale=0.7).images[0] -image -``` - -
diff --git a/docs/source/en/using-diffusers/freeu.md b/docs/source/en/using-diffusers/freeu.md deleted file mode 100644 index 7b1fb908cac9..000000000000 --- a/docs/source/en/using-diffusers/freeu.md +++ /dev/null @@ -1,135 +0,0 @@ - - -# Improve generation quality with FreeU - -[[open-in-colab]] - -The UNet is responsible for denoising during the reverse diffusion process, and there are two distinct features in its architecture: - -1. Backbone features primarily contribute to the denoising process -2. Skip features mainly introduce high-frequency features into the decoder module and can make the network overlook the semantics in the backbone features - -However, the skip connection can sometimes introduce unnatural image details. [FreeU](https://hf.co/papers/2309.11497) is a technique for improving image quality by rebalancing the contributions from the UNet’s skip connections and backbone feature maps. - -FreeU is applied during inference and it does not require any additional training. The technique works for different tasks such as text-to-image, image-to-image, and text-to-video. - -In this guide, you will apply FreeU to the [`StableDiffusionPipeline`], [`StableDiffusionXLPipeline`], and [`TextToVideoSDPipeline`]. You need to install Diffusers from source to run the examples below. - -## StableDiffusionPipeline - -Load the pipeline: - -```py -from diffusers import DiffusionPipeline -import torch - -pipeline = DiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None -).to("cuda") -``` - -Then enable the FreeU mechanism with the FreeU-specific hyperparameters. These values are scaling factors for the backbone and skip features. - -```py -pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4) -``` - -The values above are from the official FreeU [code repository](https://github.com/ChenyangSi/FreeU) where you can also find [reference hyperparameters](https://github.com/ChenyangSi/FreeU#range-for-more-parameters) for different models. - - - -Disable the FreeU mechanism by calling `disable_freeu()` on a pipeline. 
- - - -And then run inference: - -```py -prompt = "A squirrel eating a burger" -seed = 2023 -image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0] -image -``` - -The figure below compares non-FreeU and FreeU results respectively for the same hyperparameters used above (`prompt` and `seed`): - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdv1_5_freeu.jpg) - - -Let's see how Stable Diffusion 2 results are impacted: - -```py -from diffusers import DiffusionPipeline -import torch - -pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None -).to("cuda") - -prompt = "A squirrel eating a burger" -seed = 2023 - -pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.1, b2=1.2) -image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0] -image -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdv2_1_freeu.jpg) - -## Stable Diffusion XL - -Finally, let's take a look at how FreeU affects Stable Diffusion XL results: - -```py -from diffusers import DiffusionPipeline -import torch - -pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, -).to("cuda") - -prompt = "A squirrel eating a burger" -seed = 2023 - -# Comes from -# https://wandb.ai/nasirk24/UNET-FreeU-SDXL/reports/FreeU-SDXL-Optimal-Parameters--Vmlldzo1NDg4NTUw -pipeline.enable_freeu(s1=0.6, s2=0.4, b1=1.1, b2=1.2) -image = pipeline(prompt, generator=torch.manual_seed(seed)).images[0] -image -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/freeu/sdxl_freeu.jpg) - -## Text-to-video generation - -FreeU can also be used to improve video quality: - -```python -from diffusers import DiffusionPipeline -from diffusers.utils import export_to_video -import torch - -model_id = "cerspense/zeroscope_v2_576w" -pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") - -prompt = "an astronaut riding a horse on mars" -seed = 2023 - -# The values come from -# https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines -pipe.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2) -video_frames = pipe(prompt, height=320, width=576, num_frames=30, generator=torch.manual_seed(seed)).frames[0] -export_to_video(video_frames, "astronaut_rides_horse.mp4") -``` - -Thanks to [kadirnar](https://github.com/kadirnar/) for helping to integrate the feature, and to [justindujardin](https://github.com/justindujardin) for the helpful discussions. diff --git a/docs/source/en/using-diffusers/image_quality.md b/docs/source/en/using-diffusers/image_quality.md new file mode 100644 index 000000000000..8961f88b904d --- /dev/null +++ b/docs/source/en/using-diffusers/image_quality.md @@ -0,0 +1,190 @@ + + +# Controlling image quality + +The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better image lighting and details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training. + +This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images. 
+ +## Lighting + +The Stable Diffusion models aren't very good at generating images that are very bright or dark because the scheduler doesn't start sampling from the last timestep and it doesn't enforce a zero signal-to-noise ratio (SNR). The [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper fixes these issues which are now available in some Diffusers schedulers. + +> [!TIP] +> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts. +> +> ```bash +> --prediction_type="v_prediction" +> ``` + +For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Now you should configure the following parameters in the [`DDIMScheduler`]. + +* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR +* `timestep_spacing="trailing"` to start sampling from the last timestep + +Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out. + +```py +from diffusers import DiffusionPipeline, DDIMScheduler + +pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True) + +pipeline.scheduler = DDIMScheduler.from_config( + pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" +) +pipeline.to("cuda") +prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed" +generator = torch.Generator(device="cpu").manual_seed(23) +image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0] +image +``` + +
+<!-- image comparison: "default Stable Diffusion v2-1 image" vs. "image with zero SNR and trailing timestep spacing enabled" -->
+ +## Details + +[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video. + +Use the [`~pipelines.StableDiffusionMixin.enable_freeu`] method on your pipeline and configure the scaling factors for the backbone (`b1` and `b2`) and skip connections (`s1` and `s2`). The number after each scaling factor corresponds to the stage in the UNet where the factor is applied. Take a look at the [FreeU](https://github.com/ChenyangSi/FreeU#parameters) repository for reference hyperparameters for different models. + + + + +```py +import torch +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, safety_checker=None +).to("cuda") +pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6) +generator = torch.Generator(device="cpu").manual_seed(33) +prompt = "" +image = pipeline(prompt, generator=generator).images[0] +image +``` + +
+<!-- image comparison: "FreeU disabled" vs. "FreeU enabled" (Stable Diffusion v1-5) -->
+ + +```py +import torch +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16, safety_checker=None +).to("cuda") +pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.4, b2=1.6) +generator = torch.Generator(device="cpu").manual_seed(80) +prompt = "A squirrel eating a burger" +image = pipeline(prompt, generator=generator).images[0] +image +``` + +
+<!-- image comparison: "FreeU disabled" vs. "FreeU enabled" (Stable Diffusion v2-1) -->
+ + +```py +import torch +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, +).to("cuda") +pipeline.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4) +generator = torch.Generator(device="cpu").manual_seed(13) +prompt = "A squirrel eating a burger" +image = pipeline(prompt, generator=generator).images[0] +image +``` + +
+<!-- image comparison: "FreeU disabled" vs. "FreeU enabled" (Stable Diffusion XL) -->
+ + +```py +import torch +from diffusers import DiffusionPipeline +from diffusers.utils import export_to_video + +pipeline = DiffusionPipeline.from_pretrained( + "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16 +).to("cuda") +# values come from https://github.com/lyn-rgb/FreeU_Diffusers#video-pipelines +pipeline.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2) +prompt = "Confident teddy bear surfer rides the wave in the tropics" +generator = torch.Generator(device="cpu").manual_seed(47) +video_frames = pipeline(prompt, generator=generator).frames[0] +export_to_video(video_frames, "teddy_bear.mp4", fps=10) +``` + +
+<!-- video comparison: "FreeU disabled" vs. "FreeU enabled" (text-to-video) -->
+ +Call the [`pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU. + +```py +pipeline.disable_freeu() +``` From ebc99a77aad647c5d33eb36a33c23f7b3949cb40 Mon Sep 17 00:00:00 2001 From: btlorch Date: Fri, 26 Apr 2024 02:44:53 +0200 Subject: [PATCH 06/56] Convert RGB to BGR for the SDXL watermark encoder (#7013) * Convert channel order to BGR for the watermark encoder. Convert the watermarked BGR images back to RGB. Fixes #6292 * Revert channel order before stacking images to overcome limitations that negative strides are currently not supported --------- Co-authored-by: Sayak Paul --- .../pipelines/stable_diffusion_xl/watermark.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py index 5b6e36d9f447..f457cdbdb1eb 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -28,9 +28,15 @@ def apply_watermark(self, images: torch.FloatTensor): images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy() - images = [self.encoder.encode(image, "dwtDct") for image in images] + # Convert RGB to BGR, which is the channel order expected by the watermark encoder. + images = images[:, :, :, ::-1] - images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2) + # Add watermark and convert BGR back to RGB + images = [self.encoder.encode(image, "dwtDct")[:, :, ::-1] for image in images] + + images = np.array(images) + + images = torch.from_numpy(images).permute(0, 3, 1, 2) images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0) return images From e24e54fdfac69df29711c3bf99e88e624395bb13 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:09:36 -0700 Subject: [PATCH 07/56] [docs] Fix AutoPipeline docstring (#7779) fix Co-authored-by: YiYi Xu Co-authored-by: Sayak Paul --- src/diffusers/pipelines/auto_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index 79ee3fdad461..dc617e642de8 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -216,7 +216,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ``` Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + pretrained_model_or_path (`str` or `os.PathLike`, *optional*): Can be either: - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline @@ -489,7 +489,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ``` Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + pretrained_model_or_path (`str` or `os.PathLike`, *optional*): Can be either: - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline @@ -765,7 +765,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ``` Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + pretrained_model_or_path (`str` or `os.PathLike`, *optional*): Can be either: - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline From 0d2d424fbef933e4b81bea20a660ee6fc8b75ab0 Mon Sep 17 00:00:00 2001 From: Beinsezii <39478211+Beinsezii@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:10:20 -0700 
Subject: [PATCH 08/56] Add PixArtSigmaPipeline to AutoPipeline mapping (#7783) --- src/diffusers/pipelines/auto_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index dc617e642de8..c8b682e8afe4 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -45,7 +45,7 @@ ) from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline -from .pixart_alpha import PixArtAlphaPipeline +from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline from .stable_diffusion import ( StableDiffusionImg2ImgPipeline, @@ -73,7 +73,8 @@ ("wuerstchen", WuerstchenCombinedPipeline), ("cascade", StableCascadeCombinedPipeline), ("lcm", LatentConsistencyModelPipeline), - ("pixart", PixArtAlphaPipeline), + ("pixart-alpha", PixArtAlphaPipeline), + ("pixart-sigma", PixArtSigmaPipeline), ] ) From 8e4ca1b6b2bf21aa031320cb73d31cdaf6fed7bd Mon Sep 17 00:00:00 2001 From: Fabio Rigano Date: Sat, 27 Apr 2024 00:51:11 +0200 Subject: [PATCH 09/56] [Docs] Update image masking and face id example (#7780) * [Docs] Update image masking and face id example * Update docs * Fix docs --- docs/source/en/using-diffusers/ip_adapter.md | 34 +++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index e3c4178a1507..ea5f781c625d 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -277,7 +277,7 @@ images = pipeline( ### IP-Adapter masking -Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask an an IP-Adapter. +Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask. To start, preprocess the input IP-Adapter images with the [`~image_processor.IPAdapterMaskProcessor.preprocess()`] to generate their masks. For optimal results, provide the output height and width to [`~image_processor.IPAdapterMaskProcessor.preprocess()`]. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don't have to set the `height` and `width`. @@ -305,13 +305,18 @@ masks = processor.preprocess([mask1, mask2], height=output_height, width=output_ -When there is more than one input IP-Adapter image, load them as a list to ensure each image is assigned to a different IP-Adapter. Each of the input IP-Adapter images here correspond to the masks generated above. +When there is more than one input IP-Adapter image, load them as a list and provide the IP-Adapter scale list. Each of the input IP-Adapter images here corresponds to one of the masks generated above. 
```py +pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"]) +pipeline.set_ip_adapter_scale([[0.7, 0.7]]) # one scale for each image-mask pair + face_image1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png") face_image2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png") -ip_images = [[face_image1], [face_image2]] +ip_images = [[face_image1, face_image2]] + +masks = [masks.reshape(1, masks.shape[0], masks.shape[2], masks.shape[3])] ```
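For reference, the masks preprocessed above are consumed at generation time through `cross_attention_kwargs`; the following is a minimal sketch of that call, reusing the `pipeline`, `ip_images`, and `masks` variables from the snippets above (the prompt, negative prompt, and step count are illustrative placeholders, not taken from this patch):

```py
import torch

# Sketch: generate with both IP-Adapter images, each constrained to its own mask region.
# The attention processors read the masks from the "ip_adapter_masks" key of cross_attention_kwargs.
generator = torch.Generator(device="cpu").manual_seed(0)

image = pipeline(
    prompt="2 girls",
    ip_adapter_image=ip_images,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20,
    generator=generator,
    cross_attention_kwargs={"ip_adapter_masks": masks},
).images[0]
```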
@@ -328,8 +333,6 @@ ip_images = [[face_image1], [face_image2]] Now pass the preprocessed masks to `cross_attention_kwargs` in the pipeline call. ```py -pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2) -pipeline.set_ip_adapter_scale([0.7] * 2) generator = torch.Generator(device="cpu").manual_seed(0) num_images = 1 @@ -436,7 +439,7 @@ image = torch.from_numpy(faces[0].normed_embedding) ref_images_embeds.append(image.unsqueeze(0)) ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0) neg_ref_images_embeds = torch.zeros_like(ref_images_embeds) -id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")) +id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda") generator = torch.Generator(device="cpu").manual_seed(42) @@ -452,13 +455,28 @@ images = pipeline( Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers. ```py -clip_embeds = pipeline.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), num_images, True)[0] +from insightface.utils import face_align + +ref_images_embeds = [] +ip_adapter_images = [] +app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) +app.prepare(ctx_id=0, det_size=(640, 640)) +image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB) +faces = app.get(image) +ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224)) +image = torch.from_numpy(faces[0].normed_embedding) +ref_images_embeds.append(image.unsqueeze(0)) +ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0) +neg_ref_images_embeds = torch.zeros_like(ref_images_embeds) +id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda") + +clip_embeds = pipeline.prepare_ip_adapter_image_embeds( + [ip_adapter_images], None, torch.device("cuda"), num_images, True)[0] pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16) pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False # True if Plus v2 ``` - ### Multi IP-Adapter More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style. From 9d16daaf640462a0580dd1d503e71d246809a09a Mon Sep 17 00:00:00 2001 From: "39th president of the United States, probably" <110263573+AmericanPresidentJimmyCarter@users.noreply.github.com> Date: Fri, 26 Apr 2024 21:49:15 -0400 Subject: [PATCH 10/56] Add DREAM training (#6381) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new function compute_dream_and_update_latents has been added to the training utilities that allows you to do DREAM rectified training in line with the paper https://arxiv.org/abs/2312.00210. The method can be used with an extra argument in the train_text_to_image.py script. 
Co-authored-by: Jimmy <39@🇺🇸.com> --- examples/text_to_image/README.md | 5 ++ examples/text_to_image/train_text_to_image.py | 28 ++++++++- src/diffusers/training_utils.py | 57 ++++++++++++++++++- 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index f2931d3f347e..fd6e50bc3710 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -170,6 +170,11 @@ For our small Pokemons dataset, the effects of Min-SNR weighting strategy might Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds. +#### Training with DREAM + +We support training epsilon (noise) prediction models using the [DREAM (Diffusion Rectification and Estimation-Adaptive Models) strategy](https://arxiv.org/abs/2312.00210). DREAM claims to increase model fidelity for the performance cost of an extra grad-less unet `forward` step in the training loop. You can turn on DREAM training by using the `--dream_training` argument. The `--dream_detail_preservation` argument controls the detail preservation variable p and is the default of 1 from the paper. + + ## Training with LoRA Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*. diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 84f4c6514cfd..aa704ba8ca38 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -45,7 +45,7 @@ import diffusers from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel from diffusers.optimization import get_scheduler -from diffusers.training_utils import EMAModel, compute_snr +from diffusers.training_utils import EMAModel, compute_dream_and_update_latents, compute_snr from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card from diffusers.utils.import_utils import is_xformers_available @@ -361,6 +361,20 @@ def parse_args(): help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " "More details here: https://arxiv.org/abs/2303.09556.", ) + parser.add_argument( + "--dream_training", + action="store_true", + help=( + "Use the DREAM training method, which makes training more efficient and accurate at the ", + "expense of doing an extra forward pass. See: https://arxiv.org/abs/2312.00210", + ), + ) + parser.add_argument( + "--dream_detail_preservation", + type=float, + default=1.0, + help="Dream detail preservation factor p (should be greater than 0; default=1.0, as suggested in the paper)", + ) parser.add_argument( "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
) @@ -948,6 +962,18 @@ def unwrap_model(model): else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + if args.dream_training: + noisy_latents, target = compute_dream_and_update_latents( + unet, + noise_scheduler, + timesteps, + noise, + noisy_latents, + target, + encoder_hidden_states, + args.dream_detail_preservation, + ) + # Predict the noise residual and compute loss model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py index 25e02a3d1492..b617dd2eef39 100644 --- a/src/diffusers/training_utils.py +++ b/src/diffusers/training_utils.py @@ -1,12 +1,13 @@ import contextlib import copy import random -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import torch from .models import UNet2DConditionModel +from .schedulers import SchedulerMixin from .utils import ( convert_state_dict_to_diffusers, convert_state_dict_to_peft, @@ -117,6 +118,60 @@ def resolve_interpolation_mode(interpolation_type: str): return interpolation_mode +def compute_dream_and_update_latents( + unet: UNet2DConditionModel, + noise_scheduler: SchedulerMixin, + timesteps: torch.Tensor, + noise: torch.Tensor, + noisy_latents: torch.Tensor, + target: torch.Tensor, + encoder_hidden_states: torch.Tensor, + dream_detail_preservation: float = 1.0, +) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Implements "DREAM (Diffusion Rectification and Estimation-Adaptive Models)" from http://arxiv.org/abs/2312.00210. + DREAM helps align training with sampling to help training be more efficient and accurate at the cost of an extra + forward step without gradients. + + Args: + `unet`: The state unet to use to make a prediction. + `noise_scheduler`: The noise scheduler used to add noise for the given timestep. + `timesteps`: The timesteps for the noise_scheduler to user. + `noise`: A tensor of noise in the shape of noisy_latents. + `noisy_latents`: Previously noise latents from the training loop. + `target`: The ground-truth tensor to predict after eps is removed. + `encoder_hidden_states`: Text embeddings from the text model. + `dream_detail_preservation`: A float value that indicates detail preservation level. + See reference. + + Returns: + `tuple[torch.Tensor, torch.Tensor]`: Adjusted noisy_latents and target. + """ + alphas_cumprod = noise_scheduler.alphas_cumprod.to(timesteps.device)[timesteps, None, None, None] + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 + + # The paper uses lambda = sqrt(1 - alpha) ** p, with p = 1 in their experiments. 
+ dream_lambda = sqrt_one_minus_alphas_cumprod**dream_detail_preservation + + pred = None + with torch.no_grad(): + pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + noisy_latents, target = (None, None) + if noise_scheduler.config.prediction_type == "epsilon": + predicted_noise = pred + delta_noise = (noise - predicted_noise).detach() + delta_noise.mul_(dream_lambda) + noisy_latents = noisy_latents.add(sqrt_one_minus_alphas_cumprod * delta_noise) + target = target.add(delta_noise) + elif noise_scheduler.config.prediction_type == "v_prediction": + raise NotImplementedError("DREAM has not been implemented for v-prediction") + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + return noisy_latents, target + + def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]: r""" Returns: From 56bd7e67c2e01122cc93d98f5bd114f9312a5cce Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Sat, 27 Apr 2024 07:40:35 +0530 Subject: [PATCH 11/56] [Scheduler] introduce sigma schedule. (#7649) * introduce sigma schedule. Co-authored-by: Suraj Patil * address yiyi * update docstrings. * implement the schedule for EDMDPMSolverMultistepScheduler --------- Co-authored-by: Suraj Patil --- .../scheduling_edm_dpmsolver_multistep.py | 34 ++++++++++++++++--- .../schedulers/scheduling_edm_euler.py | 34 ++++++++++++++++--- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py index 26a41d7335c5..dfc7978a2ee2 100644 --- a/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -14,6 +14,7 @@ # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm +import math from typing import List, Optional, Tuple, Union import numpy as np @@ -44,6 +45,10 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): range is [0.2, 80.0]. sigma_data (`float`, *optional*, defaults to 0.5): The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1]. + sigma_schedule (`str`, *optional*, defaults to `karras`): + Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper + (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was + incorporated in this model: https://huggingface.co/stabilityai/cosxl. num_train_timesteps (`int`, defaults to 1000): The number of diffusion steps to train the model. 
solver_order (`int`, defaults to 2): @@ -89,6 +94,7 @@ def __init__( sigma_min: float = 0.002, sigma_max: float = 80.0, sigma_data: float = 0.5, + sigma_schedule: str = "karras", num_train_timesteps: int = 1000, prediction_type: str = "epsilon", rho: float = 7.0, @@ -121,7 +127,11 @@ def __init__( ) ramp = torch.linspace(0, 1, num_train_timesteps) - sigmas = self._compute_sigmas(ramp) + if sigma_schedule == "karras": + sigmas = self._compute_karras_sigmas(ramp) + elif sigma_schedule == "exponential": + sigmas = self._compute_exponential_sigmas(ramp) + self.timesteps = self.precondition_noise(sigmas) self.sigmas = self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) @@ -236,7 +246,10 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self.num_inference_steps = num_inference_steps ramp = np.linspace(0, 1, self.num_inference_steps) - sigmas = self._compute_sigmas(ramp) + if self.config.sigma_schedule == "karras": + sigmas = self._compute_karras_sigmas(ramp) + elif self.config.sigma_schedule == "exponential": + sigmas = self._compute_exponential_sigmas(ramp) sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) self.timesteps = self.precondition_noise(sigmas) @@ -262,10 +275,9 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc self._begin_index = None self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication - # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. (2022).""" - sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -273,6 +285,18 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe min_inv_rho = sigma_min ** (1 / rho) max_inv_rho = sigma_max ** (1 / rho) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + + return sigmas + + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + """Implementation closely follows k-diffusion. + + https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 + """ + sigma_min = sigma_min or self.config.sigma_min + sigma_max = sigma_max or self.config.sigma_max + sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0) return sigmas # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample diff --git a/src/diffusers/schedulers/scheduling_edm_euler.py b/src/diffusers/schedulers/scheduling_edm_euler.py index f6a09ca1ee16..0ef9263c9e30 100644 --- a/src/diffusers/schedulers/scheduling_edm_euler.py +++ b/src/diffusers/schedulers/scheduling_edm_euler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -65,6 +66,10 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin): range is [0.2, 80.0]. 
sigma_data (`float`, *optional*, defaults to 0.5): The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1]. + sigma_schedule (`str`, *optional*, defaults to `karras`): + Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper + (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was + incorporated in this model: https://huggingface.co/stabilityai/cosxl. num_train_timesteps (`int`, defaults to 1000): The number of diffusion steps to train the model. prediction_type (`str`, defaults to `epsilon`, *optional*): @@ -84,15 +89,23 @@ def __init__( sigma_min: float = 0.002, sigma_max: float = 80.0, sigma_data: float = 0.5, + sigma_schedule: str = "karras", num_train_timesteps: int = 1000, prediction_type: str = "epsilon", rho: float = 7.0, ): + if sigma_schedule not in ["karras", "exponential"]: + raise ValueError(f"Wrong value for provided for `{sigma_schedule=}`.`") + # setable values self.num_inference_steps = None ramp = torch.linspace(0, 1, num_train_timesteps) - sigmas = self._compute_sigmas(ramp) + if sigma_schedule == "karras": + sigmas = self._compute_karras_sigmas(ramp) + elif sigma_schedule == "exponential": + sigmas = self._compute_exponential_sigmas(ramp) + self.timesteps = self.precondition_noise(sigmas) self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) @@ -200,7 +213,10 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.num_inference_steps = num_inference_steps ramp = np.linspace(0, 1, self.num_inference_steps) - sigmas = self._compute_sigmas(ramp) + if self.config.sigma_schedule == "karras": + sigmas = self._compute_karras_sigmas(ramp) + elif self.config.sigma_schedule == "exponential": + sigmas = self._compute_exponential_sigmas(ramp) sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) self.timesteps = self.precondition_noise(sigmas) @@ -211,9 +227,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 - def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: """Constructs the noise schedule of Karras et al. (2022).""" - sigma_min = sigma_min or self.config.sigma_min sigma_max = sigma_max or self.config.sigma_max @@ -221,6 +236,17 @@ def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTe min_inv_rho = sigma_min ** (1 / rho) max_inv_rho = sigma_max ** (1 / rho) sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + + return sigmas + + def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + """Implementation closely follows k-diffusion. 
+ + https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26 + """ + sigma_min = sigma_min or self.config.sigma_min + sigma_max = sigma_max or self.config.sigma_max + sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0) return sigmas # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep From 50296739878f3e17b2d25d45ef626318b44440b9 Mon Sep 17 00:00:00 2001 From: Jenyuan-Huang <112627523+DannHuang@users.noreply.github.com> Date: Mon, 29 Apr 2024 04:34:57 +0800 Subject: [PATCH 12/56] Update InstantStyle usage in IP-Adapter documentation (#7806) * enable control ip-adapter per-transformer block on-the-fly --------- Co-authored-by: sayakpaul Co-authored-by: ResearcherXman Co-authored-by: YiYi Xu --- docs/source/en/using-diffusers/ip_adapter.md | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index ea5f781c625d..02fb0c34aa79 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -661,16 +661,16 @@ image ### Style & layout control -[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This is achieved by only inserting IP-Adapters to some specific part of the model. +[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model. By default IP-Adapters are inserted to all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign scales to IP-Adapter at different layers. ```py -from diffusers import AutoPipelineForImage2Image +from diffusers import AutoPipelineForText2Image from diffusers.utils import load_image import torch -pipeline = AutoPipelineForImage2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") +pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda") pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") scale = { @@ -680,15 +680,15 @@ scale = { pipeline.set_ip_adapter_scale(scale) ``` -This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter to these two layers you can generate images following the style and layout of image prompt, but with contents more aligned to text prompt. +This will activate IP-Adapter at the second layer in the model's down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter to these two layers you can generate images following both the style and layout from image prompt, but with contents more aligned to text prompt. 
```py style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg") -generator = torch.Generator(device="cpu").manual_seed(42) +generator = torch.Generator(device="cpu").manual_seed(26) image = pipeline( prompt="a cat, masterpiece, best quality, high quality", - image=style_image, + ip_adapter_image=style_image, negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry", guidance_scale=5, num_inference_steps=30, @@ -703,7 +703,7 @@ image
IP-Adapter image
- +
generated image
@@ -718,10 +718,10 @@ scale = { } pipeline.set_ip_adapter_scale(scale) -generator = torch.Generator(device="cpu").manual_seed(42) +generator = torch.Generator(device="cpu").manual_seed(26) image = pipeline( prompt="a cat, masterpiece, best quality, high quality", - image=style_image, + ip_adapter_image=style_image, negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry", guidance_scale=5, num_inference_steps=30, @@ -732,11 +732,11 @@ image
- +
IP-Adapter only in style layer
- +
IP-Adapter in all layers
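The scale dictionaries referenced in the hunks above are unchanged context lines, so their bodies do not appear in this diff. Based on the InstantStyle section of the IP-Adapter guide, they look roughly like the following; treat the exact block names and values as illustrative, and assume `pipeline` is the SDXL text-to-image pipeline with `ip-adapter_sdxl.bin` already loaded.

```py
# Style + layout: activate the IP-Adapter only in down-block 2 (layout)
# and up-block 0 (style).
scale = {
    "down": {"block_2": [0.0, 1.0]},
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)

# Style only: activate the IP-Adapter only in the style layer.
scale = {
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
pipeline.set_ip_adapter_scale(scale)
```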
From 235d34cf567e78bf958344d3132bb018a8580295 Mon Sep 17 00:00:00 2001 From: Nilesh Date: Mon, 29 Apr 2024 06:23:29 +0530 Subject: [PATCH 13/56] Check for latents, before calling prepare_latents - sdxlImg2Img (#7582) * Check for latents, before calling prepare_latents - sdxlImg2Img * Added latents check for all the img2img pipeline * Fixed silly mistake while checking latents as None --- .../clip_guided_stable_diffusion_img2img.py | 13 ++++++++--- .../community/latent_consistency_img2img.py | 23 ++++++++++--------- .../stable_diffusion_controlnet_img2img.py | 19 +++++++-------- ...le_diffusion_controlnet_inpaint_img2img.py | 19 +++++++-------- .../controlnet/pipeline_controlnet_img2img.py | 19 +++++++-------- .../pipeline_controlnet_sd_xl_img2img.py | 21 +++++++++-------- .../pipeline_latent_consistency_img2img.py | 7 +++--- .../shap_e/pipeline_shap_e_img2img.py | 18 +++++++-------- .../pipeline_stable_unclip_img2img.py | 21 +++++++++-------- .../pipeline_stable_diffusion_xl_img2img.py | 22 ++++++++++-------- 10 files changed, 99 insertions(+), 83 deletions(-) diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py index 434d5253679a..c8e0a9094f22 100644 --- a/examples/community/clip_guided_stable_diffusion_img2img.py +++ b/examples/community/clip_guided_stable_diffusion_img2img.py @@ -359,9 +359,16 @@ def __call__( # Preprocess image image = preprocess(image, width, height) - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, self.device, generator - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + text_embeddings.dtype, + self.device, + generator, + ) if clip_guidance_scale > 0: if clip_prompt is not None: diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py index 35cd74166c68..98078a2eef96 100644 --- a/examples/community/latent_consistency_img2img.py +++ b/examples/community/latent_consistency_img2img.py @@ -335,17 +335,18 @@ def __call__( # 5. Prepare latent variable num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - image, - latent_timestep, - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - prompt_embeds.dtype, - device, - latents, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + latents, + ) bs = batch_size * num_images_per_prompt # 6. Get Guidance Scale Embedding diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py index 5f9083616a84..74674e65f0ef 100644 --- a/examples/community/stable_diffusion_controlnet_img2img.py +++ b/examples/community/stable_diffusion_controlnet_img2img.py @@ -802,15 +802,16 @@ def __call__( latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py index d056eb112165..14c4e4aa6d4e 100644 --- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py +++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py @@ -907,15 +907,16 @@ def __call__( latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) mask_image_latents = self.prepare_mask_latents( mask_image, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index a5a0aaed0f2e..022f30d819d8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -1169,15 +1169,16 @@ def __call__( self._num_timesteps = len(timesteps) # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index d32e7d81649d..d7889a9efbb5 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -1429,16 +1429,17 @@ def __call__( self._num_timesteps = len(timesteps) # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - True, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + True, + ) # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py index 8957d7140ef1..fce694d1d0bd 100644 --- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py +++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -872,9 +872,10 @@ def __call__( else self.scheduler.config.original_inference_steps ) latent_timestep = timesteps[:1] - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator - ) + if latents is None: + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) bs = batch_size * num_images_per_prompt # 6. Get Guidance Scale Embedding diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py index 02e32633cedb..700ca5db6f07 100644 --- a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py +++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -239,15 +239,15 @@ def __call__( num_embeddings = self.prior.config.num_embeddings embedding_dim = self.prior.config.embedding_dim - - latents = self.prepare_latents( - (batch_size, num_embeddings * embedding_dim), - image_embeds.dtype, - device, - generator, - latents, - self.scheduler, - ) + if latents is None: + latents = self.prepare_latents( + (batch_size, num_embeddings * embedding_dim), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index fe19b4de3127..134ec39effc5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -786,16 +786,17 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.unet.config.in_channels - latents = self.prepare_latents( - batch_size=batch_size, - num_channels_latents=num_channels_latents, - height=height, - width=width, - dtype=prompt_embeds.dtype, - device=device, - generator=generator, - latents=latents, - ) + if latents is None: + latents = self.prepare_latents( + batch_size=batch_size, + num_channels_latents=num_channels_latents, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + ) # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index b72b19d5c1ef..b98ea279c1a2 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -1247,17 +1247,19 @@ def denoising_value_valid(dnv): latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) add_noise = True if self.denoising_start is None else False + # 6. Prepare latent variables - latents = self.prepare_latents( - image, - latent_timestep, - batch_size, - num_images_per_prompt, - prompt_embeds.dtype, - device, - generator, - add_noise, - ) + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + add_noise, + ) # 7. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) From b1c5817a896ff59604f5ab2b3334df8c5c71ff5b Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 29 Apr 2024 13:44:39 +0530 Subject: [PATCH 14/56] Add debugging workflow (#7778) add debug workflow Co-authored-by: Sayak Paul --- .github/workflows/ssh-runner.yml | 55 ++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/ssh-runner.yml diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml new file mode 100644 index 000000000000..befebfbc9b96 --- /dev/null +++ b/.github/workflows/ssh-runner.yml @@ -0,0 +1,55 @@ +name: SSH into runners + +on: + workflow_dispatch: + inputs: + runner_type: + description: 'Type of runner to test (a10 or t4)' + required: true + docker_image: + description: 'Name of the Docker image' + required: true + +env: + IS_GITHUB_CI: "1" + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HOME: /mnt/cache + DIFFUSERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + +jobs: + ssh_runner: + name: "SSH" + runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci] + container: + image: ${{ github.event.inputs.docker_image }} + options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + + steps: + - name: Update clone + working-directory: /diffusers + run: | + git fetch && git checkout ${{ github.sha }} + - name: Cleanup + working-directory: /diffusers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + - name: Show installed libraries and their versions + working-directory: /diffusers + run: pip freeze + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Tailscale # In order to be able to SSH when a test fails + uses: huggingface/tailscale-action@v1 + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + waitForSSH: true From a38dd795120e1884e3396d41bf44e44fd9b1eba0 Mon Sep 17 00:00:00 2001 From: Yushu Date: Mon, 29 Apr 2024 03:54:16 -0700 Subject: [PATCH 15/56] [Pipeline] Fix error of SVD pipeline when num_videos_per_prompt > 1 (#7786) swap the order for do_classifier_free_guidance concat with repeat Co-authored-by: Sayak Paul Co-authored-by: Dhruv Nair --- 
.../pipeline_stable_video_diffusion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 070183b92409..da6832cebd4d 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -199,6 +199,9 @@ def _encode_vae_image( image = image.to(device=device) image_latents = self.vae.encode(image).latent_dist.mode() + # duplicate image_latents for each generation per prompt, using mps friendly method + image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) + if do_classifier_free_guidance: negative_image_latents = torch.zeros_like(image_latents) @@ -207,9 +210,6 @@ def _encode_vae_image( # to avoid doing two forward passes image_latents = torch.cat([negative_image_latents, image_latents]) - # duplicate image_latents for each generation per prompt, using mps friendly method - image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) - return image_latents def _get_add_time_ids( From eb96ff0d5952f6d64b09bc51a2115de1898e9210 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Mon, 29 Apr 2024 17:36:50 +0530 Subject: [PATCH 16/56] Safetensor loading in AnimateDiff conversion scripts (#7764) * update * update --- scripts/convert_animatediff_motion_lora_to_diffusers.py | 7 +++++-- scripts/convert_animatediff_motion_module_to_diffusers.py | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/convert_animatediff_motion_lora_to_diffusers.py b/scripts/convert_animatediff_motion_lora_to_diffusers.py index 509a7345793c..c680fdc68462 100644 --- a/scripts/convert_animatediff_motion_lora_to_diffusers.py +++ b/scripts/convert_animatediff_motion_lora_to_diffusers.py @@ -1,7 +1,7 @@ import argparse import torch -from safetensors.torch import save_file +from safetensors.torch import load_file, save_file def convert_motion_module(original_state_dict): @@ -34,7 +34,10 @@ def get_args(): if __name__ == "__main__": args = get_args() - state_dict = torch.load(args.ckpt_path, map_location="cpu") + if args.ckpt_path.endswith(".safetensors"): + state_dict = load_file(args.ckpt_path) + else: + state_dict = torch.load(args.ckpt_path, map_location="cpu") if "state_dict" in state_dict.keys(): state_dict = state_dict["state_dict"] diff --git a/scripts/convert_animatediff_motion_module_to_diffusers.py b/scripts/convert_animatediff_motion_module_to_diffusers.py index ceb967acd3d6..e8fb007243fd 100644 --- a/scripts/convert_animatediff_motion_module_to_diffusers.py +++ b/scripts/convert_animatediff_motion_module_to_diffusers.py @@ -1,6 +1,7 @@ import argparse import torch +from safetensors.torch import load_file from diffusers import MotionAdapter @@ -38,7 +39,11 @@ def get_args(): if __name__ == "__main__": args = get_args() - state_dict = torch.load(args.ckpt_path, map_location="cpu") + if args.ckpt_path.endswith(".safetensors"): + state_dict = load_file(args.ckpt_path) + else: + state_dict = torch.load(args.ckpt_path, map_location="cpu") + if "state_dict" in state_dict.keys(): state_dict = state_dict["state_dict"] From 8af793b2d467d0a28f9fed6e07aedd7dc2b9a0ba Mon Sep 17 00:00:00 2001 From: jschoormans Date: Mon, 29 Apr 2024 21:00:53 +0200 Subject: [PATCH 17/56] Adding TextualInversionLoaderMixin for the controlnet_inpaint_sd_xl pipeline (#7288) * added 
TextualInversionMixIn to controlnet_inpaint_sd_xl pipeline --------- Co-authored-by: YiYi Xu --- .../controlnet/pipeline_controlnet_inpaint_sd_xl.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py index 18c4370b8025..b9c4e3c0032c 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -151,7 +151,12 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): class StableDiffusionXLControlNetInpaintPipeline( - DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin + DiffusionPipeline, + StableDiffusionMixin, + StableDiffusionXLLoraLoaderMixin, + FromSingleFileMixin, + IPAdapterMixin, + TextualInversionLoaderMixin, ): r""" Pipeline for text-to-image generation using Stable Diffusion XL. @@ -160,6 +165,7 @@ class StableDiffusionXLControlNetInpaintPipeline( library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files From 83ae24ce2d9a080e850c630a1c7050f60e63e3e3 Mon Sep 17 00:00:00 2001 From: RuiningLi <88520323+RuiningLi@users.noreply.github.com> Date: Mon, 29 Apr 2024 21:32:13 +0100 Subject: [PATCH 18/56] Added get_velocity function to EulerDiscreteScheduler. (#7733) * Added get_velocity function to EulerDiscreteScheduler. * Fix white space on blank lines * Added copied from statement * back to the original. --------- Co-authored-by: Ruining Li Co-authored-by: Sayak Paul --- .../schedulers/scheduling_euler_discrete.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 1e3252c0bd39..be5bbc235878 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -576,5 +576,44 @@ def add_noise( noisy_samples = original_samples + noise * sigma return noisy_samples + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.FloatTensor + ) -> torch.FloatTensor: + if ( + isinstance(timesteps, int) + or isinstance(timesteps, torch.IntTensor) + or isinstance(timesteps, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.get_velocity()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." 
+ ), + ) + + if sample.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timesteps = timesteps.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timesteps = timesteps.to(sample.device) + + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + alphas_cumprod = self.alphas_cumprod.to(sample) + sqrt_alpha_prod = alphas_cumprod[step_indices] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[step_indices]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + def __len__(self): return self.config.num_train_timesteps From f53352f750725a4bf4a44220db196c0f26f3ff81 Mon Sep 17 00:00:00 2001 From: Clint Adams <223406+clinty@users.noreply.github.com> Date: Mon, 29 Apr 2024 17:39:59 -0400 Subject: [PATCH 19/56] Set main_input_name in StableDiffusionSafetyChecker to "clip_input" (#7500) FlaxStableDiffusionSafetyChecker sets main_input_name to "clip_input". This makes StableDiffusionSafetyChecker consistent. Co-authored-by: Sayak Paul Co-authored-by: YiYi Xu --- src/diffusers/pipelines/stable_diffusion/safety_checker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py index 6cc4d26f29b4..3e6dec3e0bff 100644 --- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py +++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -31,6 +31,7 @@ def cosine_distance(image_embeds, text_embeds): class StableDiffusionSafetyChecker(PreTrainedModel): config_class = CLIPConfig + main_input_name = "clip_input" _no_split_modules = ["CLIPEncoderLayer"] From 31d9f9ea77d7bda61484ef9a29d8453f88c6e28d Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Tue, 30 Apr 2024 07:54:38 +0530 Subject: [PATCH 20/56] [Tests] reduce the model size in the ddim fast test (#7803) chore: reducing model size for ddim fast pipeline Co-authored-by: Sayak Paul --- tests/pipelines/ddim/test_ddim.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py index 0f0654397a34..2078a592ceca 100644 --- a/tests/pipelines/ddim/test_ddim.py +++ b/tests/pipelines/ddim/test_ddim.py @@ -42,9 +42,10 @@ class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, + block_out_channels=(4, 8), + layers_per_block=1, + norm_num_groups=4, + sample_size=8, in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), @@ -79,10 +80,8 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] - self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array( - [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04] - ) + self.assertEqual(image.shape, (1, 8, 8, 3)) + 
expected_slice = np.array([0.0, 9.979e-01, 0.0, 9.999e-01, 9.986e-01, 9.991e-01, 7.106e-04, 0.0, 0.0]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 1e-3) From 21f023ec1acefbe3efa470451838dab4c133e098 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Tue, 30 Apr 2024 08:11:03 +0530 Subject: [PATCH 21/56] [Tests] reduce the model size in the ddpm fast test (#7797) * chore: reducing unet size for faster tests * review suggestions --------- Co-authored-by: Sayak Paul --- tests/pipelines/ddpm/test_ddpm.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py index c0cce3a2f237..f6d0821da4c2 100644 --- a/tests/pipelines/ddpm/test_ddpm.py +++ b/tests/pipelines/ddpm/test_ddpm.py @@ -30,9 +30,10 @@ class DDPMPipelineFastTests(unittest.TestCase): def dummy_uncond_unet(self): torch.manual_seed(0) model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, + block_out_channels=(4, 8), + layers_per_block=1, + norm_num_groups=4, + sample_size=8, in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), @@ -58,10 +59,8 @@ def test_fast_inference(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04] - ) + assert image.shape == (1, 8, 8, 3) + expected_slice = np.array([0.0, 0.9996672, 0.00329116, 1.0, 0.9995991, 1.0, 0.0060907, 0.00115037, 0.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 @@ -83,7 +82,7 @@ def test_inference_predict_sample(self): image_slice = image[0, -3:, -3:, -1] image_eps_slice = image_eps[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) + assert image.shape == (1, 8, 8, 3) tolerance = 1e-2 if torch_device != "mps" else 3e-2 assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance From b02e2113ff4625100a4412abd1ae0392ee415364 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Tue, 30 Apr 2024 08:11:26 +0530 Subject: [PATCH 22/56] [Tests] reduce the model size in the amused fast test (#7804) * chore: reducing model sizes * chore: shrinks further * chore: shrinks further * chore: shrinking model for img2img pipeline * chore: reducing size of model for inpaint pipeline --------- Co-authored-by: Sayak Paul --- tests/pipelines/amused/test_amused.py | 36 +++++++++---------- tests/pipelines/amused/test_amused_img2img.py | 36 +++++++++---------- tests/pipelines/amused/test_amused_inpaint.py | 36 +++++++++---------- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tests/pipelines/amused/test_amused.py b/tests/pipelines/amused/test_amused.py index f03751e2f830..9a9e2551d642 100644 --- a/tests/pipelines/amused/test_amused.py +++ b/tests/pipelines/amused/test_amused.py @@ -38,17 +38,17 @@ class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) transformer = UVit2DModel( - hidden_size=32, + hidden_size=8, use_bias=False, hidden_dropout=0.0, - cond_embed_dim=32, + cond_embed_dim=8, micro_cond_encode_dim=2, micro_cond_embed_dim=10, - encoder_hidden_size=32, + encoder_hidden_size=8, vocab_size=32, - codebook_size=32, - in_channels=32, - block_out_channels=32, + 
codebook_size=8, + in_channels=8, + block_out_channels=8, num_res_blocks=1, downsample=True, upsample=True, @@ -56,7 +56,7 @@ def get_dummy_components(self): num_hidden_layers=1, num_attention_heads=1, attention_dropout=0.0, - intermediate_size=32, + intermediate_size=8, layer_norm_eps=1e-06, ln_elementwise_affine=True, ) @@ -64,17 +64,17 @@ def get_dummy_components(self): torch.manual_seed(0) vqvae = VQModel( act_fn="silu", - block_out_channels=[32], + block_out_channels=[8], down_block_types=[ "DownEncoderBlock2D", ], in_channels=3, - latent_channels=32, - layers_per_block=2, - norm_num_groups=32, - num_vq_embeddings=32, + latent_channels=8, + layers_per_block=1, + norm_num_groups=8, + num_vq_embeddings=8, out_channels=3, - sample_size=32, + sample_size=8, up_block_types=[ "UpDecoderBlock2D", ], @@ -85,14 +85,14 @@ def get_dummy_components(self): text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, - intermediate_size=64, + hidden_size=8, + intermediate_size=8, layer_norm_eps=1e-05, - num_attention_heads=8, - num_hidden_layers=3, + num_attention_heads=1, + num_hidden_layers=1, pad_token_id=1, vocab_size=1000, - projection_dim=32, + projection_dim=8, ) text_encoder = CLIPTextModelWithProjection(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") diff --git a/tests/pipelines/amused/test_amused_img2img.py b/tests/pipelines/amused/test_amused_img2img.py index efbca1f437a4..24bc34d330e9 100644 --- a/tests/pipelines/amused/test_amused_img2img.py +++ b/tests/pipelines/amused/test_amused_img2img.py @@ -42,17 +42,17 @@ class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) transformer = UVit2DModel( - hidden_size=32, + hidden_size=8, use_bias=False, hidden_dropout=0.0, - cond_embed_dim=32, + cond_embed_dim=8, micro_cond_encode_dim=2, micro_cond_embed_dim=10, - encoder_hidden_size=32, + encoder_hidden_size=8, vocab_size=32, - codebook_size=32, - in_channels=32, - block_out_channels=32, + codebook_size=8, + in_channels=8, + block_out_channels=8, num_res_blocks=1, downsample=True, upsample=True, @@ -60,7 +60,7 @@ def get_dummy_components(self): num_hidden_layers=1, num_attention_heads=1, attention_dropout=0.0, - intermediate_size=32, + intermediate_size=8, layer_norm_eps=1e-06, ln_elementwise_affine=True, ) @@ -68,17 +68,17 @@ def get_dummy_components(self): torch.manual_seed(0) vqvae = VQModel( act_fn="silu", - block_out_channels=[32], + block_out_channels=[8], down_block_types=[ "DownEncoderBlock2D", ], in_channels=3, - latent_channels=32, - layers_per_block=2, - norm_num_groups=32, - num_vq_embeddings=32, + latent_channels=8, + layers_per_block=1, + norm_num_groups=8, + num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half' out_channels=3, - sample_size=32, + sample_size=8, up_block_types=[ "UpDecoderBlock2D", ], @@ -89,14 +89,14 @@ def get_dummy_components(self): text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, - intermediate_size=64, + hidden_size=8, + intermediate_size=8, layer_norm_eps=1e-05, - num_attention_heads=8, - num_hidden_layers=3, + num_attention_heads=1, + num_hidden_layers=1, pad_token_id=1, vocab_size=1000, - projection_dim=32, + projection_dim=8, ) text_encoder = CLIPTextModelWithProjection(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") diff --git 
a/tests/pipelines/amused/test_amused_inpaint.py b/tests/pipelines/amused/test_amused_inpaint.py index d397f8d81297..d0c1ed09c706 100644 --- a/tests/pipelines/amused/test_amused_inpaint.py +++ b/tests/pipelines/amused/test_amused_inpaint.py @@ -42,17 +42,17 @@ class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) transformer = UVit2DModel( - hidden_size=32, + hidden_size=8, use_bias=False, hidden_dropout=0.0, - cond_embed_dim=32, + cond_embed_dim=8, micro_cond_encode_dim=2, micro_cond_embed_dim=10, - encoder_hidden_size=32, + encoder_hidden_size=8, vocab_size=32, - codebook_size=32, - in_channels=32, - block_out_channels=32, + codebook_size=32, # codebook size needs to be consistent with num_vq_embeddings for inpaint tests + in_channels=8, + block_out_channels=8, num_res_blocks=1, downsample=True, upsample=True, @@ -60,7 +60,7 @@ def get_dummy_components(self): num_hidden_layers=1, num_attention_heads=1, attention_dropout=0.0, - intermediate_size=32, + intermediate_size=8, layer_norm_eps=1e-06, ln_elementwise_affine=True, ) @@ -68,17 +68,17 @@ def get_dummy_components(self): torch.manual_seed(0) vqvae = VQModel( act_fn="silu", - block_out_channels=[32], + block_out_channels=[8], down_block_types=[ "DownEncoderBlock2D", ], in_channels=3, - latent_channels=32, - layers_per_block=2, - norm_num_groups=32, - num_vq_embeddings=32, + latent_channels=8, + layers_per_block=1, + norm_num_groups=8, + num_vq_embeddings=32, # reducing this to 16 or 8 -> RuntimeError: "cdist_cuda" not implemented for 'Half' out_channels=3, - sample_size=32, + sample_size=8, up_block_types=[ "UpDecoderBlock2D", ], @@ -89,14 +89,14 @@ def get_dummy_components(self): text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, - intermediate_size=64, + hidden_size=8, + intermediate_size=8, layer_norm_eps=1e-05, - num_attention_heads=8, - num_hidden_layers=3, + num_attention_heads=1, + num_hidden_layers=1, pad_token_id=1, vocab_size=1000, - projection_dim=32, + projection_dim=8, ) text_encoder = CLIPTextModelWithProjection(text_encoder_config) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") From 3fd31eef518b73ee592f82435f3d370a716ead4f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 30 Apr 2024 08:46:51 +0530 Subject: [PATCH 23/56] [Core] introduce _no_split_modules to `ModelMixin` (#6396) * introduce _no_split_modules. * unnecessary spaces. * remove unnecessary kwargs and style * fix: accelerate imports. * change to _determine_device_map * add the blocks that have residual connections. * add: CrossAttnUpBlock2D * add: testin * style * line-spaces * quality * add disk offload test without safetensors. * checking disk offloading percentages. * change model split * add: utility for checking multi-gpu requirement. * model parallelism test * splits. * splits. * splits * splits. * splits. * splits. 
* offload folder to test_disk_offload_with_safetensors * add _no_split_modules * fix-copies --- .../models/autoencoders/autoencoder_kl.py | 1 + src/diffusers/models/modeling_utils.py | 92 ++++++++++++- .../models/transformers/transformer_2d.py | 1 + .../models/unets/unet_2d_condition.py | 1 + .../versatile_diffusion/modeling_text_unet.py | 1 + tests/models/test_modeling_common.py | 128 ++++++++++++++++++ .../unets/test_models_unet_2d_condition.py | 2 + 7 files changed, 221 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index b286453de424..0b9b9d4d47e5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -65,6 +65,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin): """ _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] @register_to_config def __init__( diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index c1fdff8ab356..8d9f2d9e71fc 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -57,7 +57,8 @@ if is_accelerate_available(): import accelerate - from accelerate.utils import set_module_tensor_to_device + from accelerate import infer_auto_device_map + from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device from accelerate.utils.versions import is_torch_version @@ -99,6 +100,29 @@ def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: return first_tuple[1].dtype +# Adapted from `transformers` (see modeling_utils.py) +def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype): + if isinstance(device_map, str): + no_split_modules = model._get_no_split_modules(device_map) + device_map_kwargs = {"no_split_module_classes": no_split_modules} + + if device_map != "sequential": + max_memory = get_balanced_memory( + model, + dtype=torch_dtype, + low_zero=(device_map == "balanced_low_0"), + max_memory=max_memory, + **device_map_kwargs, + ) + else: + max_memory = get_max_memory(max_memory) + + device_map_kwargs["max_memory"] = max_memory + device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs) + + return device_map + + def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None): """ Reads a checkpoint file, returning properly formatted errors if they arise. @@ -201,6 +225,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False _keys_to_ignore_on_load_unexpected = None + _no_split_modules = None def __init__(self): super().__init__() @@ -560,6 +585,36 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P " dispatching. Please make sure to set `low_cpu_mem_usage=True`." ) + # change device_map into a map if we passed an int, a str or a torch.device + if isinstance(device_map, torch.device): + device_map = {"": device_map} + elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: + try: + device_map = {"": torch.device(device_map)} + except RuntimeError: + raise ValueError( + "When passing device_map as a string, the value needs to be a device name (e.g. 
cpu, cuda:0) or " + f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." + ) + elif isinstance(device_map, int): + if device_map < 0: + raise ValueError( + "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' " + ) + else: + device_map = {"": device_map} + + if device_map is not None: + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + elif not low_cpu_mem_usage: + raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`") + + if low_cpu_mem_usage: + if device_map is not None and not is_torch_version(">=", "1.10"): + # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info. + raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.") + # Load config if we don't provide a configuration config_path = pretrained_model_name_or_path @@ -582,10 +637,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P token=token, revision=revision, subfolder=subfolder, - device_map=device_map, - max_memory=max_memory, - offload_folder=offload_folder, - offload_state_dict=offload_state_dict, user_agent=user_agent, **kwargs, ) @@ -690,6 +741,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: # else let accelerate handle loading and dispatching. # Load weights and dispatch according to the device_map # by default the device_map is None and the weights are loaded on the CPU + device_map = _determine_device_map(model, device_map, max_memory, torch_dtype) try: accelerate.load_checkpoint_and_dispatch( model, @@ -881,6 +933,36 @@ def _find_mismatched_keys( return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + # Adapted from `transformers` modeling_utils.py + def _get_no_split_modules(self, device_map: str): + """ + Get the modules of the model that should not be spit when using device_map. We iterate through the modules to + get the underlying `_no_split_modules`. + + Args: + device_map (`str`): + The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] + + Returns: + `List[str]`: List of modules that should not be split + """ + _no_split_modules = set() + modules_to_check = [self] + while len(modules_to_check) > 0: + module = modules_to_check.pop(-1) + # if the module does not appear in _no_split_modules, we also check the children + if module.__class__.__name__ not in _no_split_modules: + if isinstance(module, ModelMixin): + if module._no_split_modules is None: + raise ValueError( + f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " + "class needs to implement the `_no_split_modules` attribute." 
+ ) + else: + _no_split_modules = _no_split_modules | set(module._no_split_modules) + modules_to_check += list(module.children()) + return list(_no_split_modules) + @property def device(self) -> torch.device: """ diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py index 768fceb71ae5..6a2695b9e436 100644 --- a/src/diffusers/models/transformers/transformer_2d.py +++ b/src/diffusers/models/transformers/transformer_2d.py @@ -72,6 +72,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin): """ _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock"] @register_to_config def __init__( diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py index 34327e1049c5..697730b359ff 100644 --- a/src/diffusers/models/unets/unet_2d_condition.py +++ b/src/diffusers/models/unets/unet_2d_condition.py @@ -161,6 +161,7 @@ class conditioning with `class_embed_type` equal to `None`. """ _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D", "CrossAttnUpBlock2D"] @register_to_config def __init__( diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py index 3c3bd526692d..c84caa1fad88 100644 --- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -363,6 +363,7 @@ class conditioning with `class_embed_type` equal to `None`. """ _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock", "ResnetBlockFlat", "CrossAttnUpBlockFlat"] @register_to_config def __init__( diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index f919ba10fbb7..d8a93d40c8bf 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -24,6 +24,7 @@ import numpy as np import requests_mock import torch +from accelerate.utils import compute_module_sizes from huggingface_hub import ModelCard, delete_repo from huggingface_hub.utils import is_jinja_available from requests.exceptions import HTTPError @@ -39,6 +40,7 @@ require_torch_2, require_torch_accelerator_with_training, require_torch_gpu, + require_torch_multi_gpu, run_test_in_subprocess, torch_device, ) @@ -200,6 +202,21 @@ class ModelTesterMixin: main_input_name = None # overwrite in model specific tester class base_precision = 1e-3 forward_requires_fresh_args = False + model_split_percents = [0.5, 0.7, 0.9] + + def check_device_map_is_respected(self, model, device_map): + for param_name, param in model.named_parameters(): + # Find device in device_map + while len(param_name) > 0 and param_name not in device_map: + param_name = ".".join(param_name.split(".")[:-1]) + if param_name not in device_map: + raise ValueError("device map is incomplete, it does not contain any device for `param_name`.") + + param_device = device_map[param_name] + if param_device in ["cpu", "disk"]: + self.assertEqual(param.device, torch.device("meta")) + else: + self.assertEqual(param.device, torch.device(param_device)) def test_from_save_pretrained(self, expected_max_diff=5e-5): if self.forward_requires_fresh_args: @@ -670,6 +687,117 @@ def test_deprecated_kwargs(self): " from `_deprecated_kwargs = []`" ) + @require_torch_gpu + def test_cpu_offload(self): + config, inputs_dict = 
self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**config).eval() + model = model.to(torch_device) + + torch.manual_seed(0) + base_output = model(**inputs_dict) + + model_size = compute_module_sizes(model)[""] + # We test several splits of sizes to make sure it works. + max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]] + with tempfile.TemporaryDirectory() as tmp_dir: + model.cpu().save_pretrained(tmp_dir) + + for max_size in max_gpu_sizes: + max_memory = {0: max_size, "cpu": model_size * 2} + new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) + # Making sure part of the model will actually end up offloaded + self.assertSetEqual(set(new_model.hf_device_map.values()), {0, "cpu"}) + + self.check_device_map_is_respected(new_model, new_model.hf_device_map) + torch.manual_seed(0) + new_output = new_model(**inputs_dict) + + self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + + @require_torch_gpu + def test_disk_offload_without_safetensors(self): + config, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**config).eval() + model = model.to(torch_device) + + torch.manual_seed(0) + base_output = model(**inputs_dict) + + model_size = compute_module_sizes(model)[""] + with tempfile.TemporaryDirectory() as tmp_dir: + model.cpu().save_pretrained(tmp_dir, safe_serialization=False) + + with self.assertRaises(ValueError): + max_size = int(self.model_split_percents[1] * model_size) + max_memory = {0: max_size, "cpu": max_size} + # This errors out because it's missing an offload folder + new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) + + max_size = int(self.model_split_percents[1] * model_size) + max_memory = {0: max_size, "cpu": max_size} + new_model = self.model_class.from_pretrained( + tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir + ) + + self.check_device_map_is_respected(new_model, new_model.hf_device_map) + torch.manual_seed(0) + new_output = new_model(**inputs_dict) + + self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + + @require_torch_gpu + def test_disk_offload_with_safetensors(self): + config, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**config).eval() + model = model.to(torch_device) + + torch.manual_seed(0) + base_output = model(**inputs_dict) + + model_size = compute_module_sizes(model)[""] + with tempfile.TemporaryDirectory() as tmp_dir: + model.cpu().save_pretrained(tmp_dir) + + max_size = int(self.model_split_percents[1] * model_size) + max_memory = {0: max_size, "cpu": max_size} + new_model = self.model_class.from_pretrained( + tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory + ) + + self.check_device_map_is_respected(new_model, new_model.hf_device_map) + torch.manual_seed(0) + new_output = new_model(**inputs_dict) + + self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + + @require_torch_multi_gpu + def test_model_parallelism(self): + config, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**config).eval() + model = model.to(torch_device) + + torch.manual_seed(0) + base_output = model(**inputs_dict) + + model_size = compute_module_sizes(model)[""] + # We test several splits of sizes to make sure it works. 
+ max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents[1:]] + with tempfile.TemporaryDirectory() as tmp_dir: + model.cpu().save_pretrained(tmp_dir) + + for max_size in max_gpu_sizes: + max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2} + new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) + # Making sure part of the model will actually end up offloaded + self.assertSetEqual(set(new_model.hf_device_map.values()), {0, 1}) + + self.check_device_map_is_respected(new_model, new_model.hf_device_map) + + torch.manual_seed(0) + new_output = new_model(**inputs_dict) + + self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5)) + @is_staging_test class ModelPushToHubTester(unittest.TestCase): diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py index 1b8a998cfd66..33aa6a10377b 100644 --- a/tests/models/unets/test_models_unet_2d_condition.py +++ b/tests/models/unets/test_models_unet_2d_condition.py @@ -300,6 +300,8 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True): class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet2DConditionModel main_input_name = "sample" + # We override the items here because the unet under consideration is small. + model_split_percents = [0.5, 0.3, 0.4] @property def dummy_input(self): From 26a7851e1e0b18da746d6ae80bb105050f7187e0 Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Tue, 30 Apr 2024 07:16:30 +0300 Subject: [PATCH 24/56] Add B-Lora training option to the advanced dreambooth lora script (#7741) * add blora * add blora * add blora * add blora * little changes * little changes * remove redundancies * fixes * add B LoRA to readme * style * inference * defaults + path to loras+ generation * minor changes * style * minor changes * minor changes * blora arg * added --lora_unet_blocks * style * Update examples/advanced_diffusion_training/README.md Co-authored-by: Sayak Paul * add commit hash to B-LoRA repo cloneing * change inference, remove cloning * change inference, remove cloning add section about configureable unet blocks * change inference, remove cloning add section about configureable unet blocks * Apply suggestions from code review --------- Co-authored-by: Sayak Paul --- .../advanced_diffusion_training/README.md | 143 +++++++++++++++++- .../train_dreambooth_lora_sdxl_advanced.py | 107 +++++++++++-- 2 files changed, 236 insertions(+), 14 deletions(-) diff --git a/examples/advanced_diffusion_training/README.md b/examples/advanced_diffusion_training/README.md index fda73f9ce7a5..a13ae719cfdc 100644 --- a/examples/advanced_diffusion_training/README.md +++ b/examples/advanced_diffusion_training/README.md @@ -234,7 +234,7 @@ In ComfyUI we will load a LoRA and a textual embedding at the same time. SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). ### DoRA training -The advanced script now supports DoRA training too! +The advanced script supports DoRA training too! 
> Proposed in [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353), **DoRA** is very similar to LoRA, except it decomposes the pre-trained weight into two components, **magnitude** and **direction** and employs LoRA for _directional_ updates to efficiently minimize the number of trainable parameters. The authors found that by using DoRA, both the learning capacity and training stability of LoRA are enhanced without any additional overhead during inference. @@ -304,6 +304,147 @@ accelerate launch train_dreambooth_lora_sdxl_advanced.py \ > [!CAUTION] > Min-SNR gamma is not supported with the EDM-style training yet. When training with the PlaygroundAI model, it's recommended to not pass any "variant". +### B-LoRA training +The advanced script now supports B-LoRA training too! +> Proposed in [Implicit Style-Content Separation using B-LoRA](https://arxiv.org/abs/2403.14572), +B-LoRA is a method that leverages LoRA to implicitly separate the style and content components of a **single** image. +It was shown that learning the LoRA weights of two specific blocks (referred to as B-LoRAs) +achieves style-content separation that cannot be achieved by training each B-LoRA independently. +Once trained, the two B-LoRAs can be used as independent components to allow various image stylization tasks + +**Usage** +Enable B-LoRA training by adding this flag +```bash +--use_blora +``` +You can train a B-LoRA with as little as 1 image, and 1000 steps. Try this default configuration as a start: +```bash +!accelerate launch train_dreambooth_b-lora_sdxl.py \ + --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \ + --instance_data_dir="linoyts/B-LoRA_teddy_bear" \ + --output_dir="B-LoRA_teddy_bear" \ + --instance_prompt="a [v18]" \ + --resolution=1024 \ + --rank=64 \ + --train_batch_size=1 \ + --learning_rate=5e-5 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=1000 \ + --checkpointing_steps=2000 \ + --seed="0" \ + --gradient_checkpointing \ + --mixed_precision="fp16" +``` +**Inference** +The inference is a bit different: +1. we need load *specific* unet layers (as opposed to a regular LoRA/DoRA) +2. the trained layers we load, changes based on our objective (e.g. 
style/content) + +```python +import torch +from diffusers import StableDiffusionXLPipeline, AutoencoderKL + +# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py +def is_belong_to_blocks(key, blocks): + try: + for g in blocks: + if g in key: + return True + return False + except Exception as e: + raise type(e)(f'failed to is_belong_to_block, due to: {e}') + +def lora_lora_unet_blocks(lora_path, alpha, target_blocks): + state_dict, _ = pipeline.lora_state_dict(lora_path) + filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)} + return filtered_state_dict + +vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) +pipeline = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, +).to("cuda") + +# pick a blora for content/style (you can also set one to None) +content_B_lora_path = "lora-library/B-LoRA-teddybear" +style_B_lora_path= "lora-library/B-LoRA-pen_sketch" + + +content_B_LoRA = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"]) +style_B_LoRA = lora_lora_unet_blocks(style_B_lora_path,alpha=1.1,target_blocks=["unet.up_blocks.0.attentions.1"]) +combined_lora = {**content_B_LoRA, **style_B_LoRA} + +# Load both loras +pipeline.load_lora_into_unet(combined_lora, None, pipeline.unet) + +#generate +prompt = "a [v18] in [v30] style" +pipeline(prompt, num_images_per_prompt=4).images +``` +### LoRA training of Targeted U-net Blocks +The advanced script now supports custom choice of U-net blocks to train during Dreambooth LoRA tuning. +> [!NOTE] +> This feature is still experimental + +> Recently, works like B-LoRA showed the potential advantages of learning the LoRA weights of specific U-net blocks, not only in speed & memory, +> but also in reducing the amount of needed data, improving style manipulation and overcoming overfitting issues. +> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks. + +**Usage** +Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks. +e.g: +```bash +--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1" +``` + +> [!NOTE] +> if you specify both `--use_blora` and `--lora_unet_blocks`, values given in --lora_unet_blocks will be ignored. +> When enabling --use_blora, targeted U-net blocks are automatically set to be "unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1" as discussed in the paper. +> If you wish to experiment with different blocks, specify `--lora_unet_blocks` only. + +**Inference** +Inference is the same as for B-LoRAs, except the input targeted blocks should be modified based on your training configuration. 
+```python +import torch +from diffusers import StableDiffusionXLPipeline, AutoencoderKL + +# taken & modified from B-LoRA repo - https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py +def is_belong_to_blocks(key, blocks): + try: + for g in blocks: + if g in key: + return True + return False + except Exception as e: + raise type(e)(f'failed to is_belong_to_block, due to: {e}') + +def lora_lora_unet_blocks(lora_path, alpha, target_blocks): + state_dict, _ = pipeline.lora_state_dict(lora_path) + filtered_state_dict = {k: v * alpha for k, v in state_dict.items() if is_belong_to_blocks(k, target_blocks)} + return filtered_state_dict + +vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) +pipeline = StableDiffusionXLPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + vae=vae, + torch_dtype=torch.float16, +).to("cuda") + +lora_path = "lora-library/B-LoRA-pen_sketch" + +state_dict = lora_lora_unet_blocks(content_B_lora_path,alpha=1,target_blocks=["unet.up_blocks.0.attentions.0"]) + +# Load traine dlora layers into the unet +pipeline.load_lora_into_unet(state_dict, None, pipeline.unet) + +#generate +prompt = "a dog in [v30] style" +pipeline(prompt, num_images_per_prompt=4).images +``` + + ### Tips and Tricks Check out [these recommended practices](https://huggingface.co/blog/sdxl_lora_advanced_script#additional-good-practices) diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index 21a84b77245a..0699ac17077d 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -15,7 +15,6 @@ import argparse import gc -import hashlib import itertools import json import logging @@ -40,6 +39,7 @@ from accelerate.logging import get_logger from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed from huggingface_hub import create_repo, hf_hub_download, upload_folder +from huggingface_hub.utils import insecure_hashlib from packaging import version from peft import LoraConfig, set_peft_model_state_dict from peft.utils import get_peft_model_state_dict @@ -696,6 +696,23 @@ def parse_args(input_args=None): "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`" ), ) + parser.add_argument( + "--lora_unet_blocks", + type=str, + default=None, + help=( + "the U-net blocks to tune during training. please specify them in a comma separated string, e.g. `unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1` etc." + "NOTE: By default (if not specified) - regular LoRA training is performed. " + "if --use_blora is enabled, this arg will be ignored, since in B-LoRA training, targeted U-net blocks are `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`" + ), + ) + parser.add_argument( + "--use_blora", + action="store_true", + help=( + "Whether to train a B-LoRA as proposed in- Implicit Style-Content Separation using B-LoRA https://arxiv.org/abs/2403.14572. 
" + ), + ) parser.add_argument( "--cache_latents", action="store_true", @@ -720,6 +737,11 @@ def parse_args(input_args=None): "For full LoRA text encoder training check --train_text_encoder, for textual " "inversion training check `--train_text_encoder_ti`" ) + if args.use_blora and args.lora_unet_blocks: + warnings.warn( + "You specified both `--use_blora` and `--lora_unet_blocks`, for B-LoRA training, target unet blocks are: `unet.up_blocks.0.attentions.0` and `unet.up_blocks.0.attentions.1`. " + "If you wish to target different U-net blocks, don't enable `--use_blora`" + ) env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: @@ -740,6 +762,40 @@ def parse_args(input_args=None): return args +# Taken (and slightly modified) from B-LoRA repo https://github.com/yardenfren1996/B-LoRA/blob/main/blora_utils.py +def is_belong_to_blocks(key, blocks): + try: + for g in blocks: + if g in key: + return True + return False + except Exception as e: + raise type(e)(f"failed to is_belong_to_block, due to: {e}") + + +def get_unet_lora_target_modules(unet, use_blora, target_blocks=None): + if use_blora: + content_b_lora_blocks = "unet.up_blocks.0.attentions.0" + style_b_lora_blocks = "unet.up_blocks.0.attentions.1" + target_blocks = [content_b_lora_blocks, style_b_lora_blocks] + try: + blocks = [(".").join(blk.split(".")[1:]) for blk in target_blocks] + + attns = [ + attn_processor_name.rsplit(".", 1)[0] + for attn_processor_name, _ in unet.attn_processors.items() + if is_belong_to_blocks(attn_processor_name, blocks) + ] + + target_modules = [f"{attn}.{mat}" for mat in ["to_k", "to_q", "to_v", "to_out.0"] for attn in attns] + return target_modules + except Exception as e: + raise type(e)( + f"failed to get_target_modules, due to: {e}. " + f"Please check the modules specified in --lora_unet_blocks are correct" + ) + + # Taken from https://github.com/replicate/cog-sdxl/blob/main/dataset_and_utils.py class TokenEmbeddingsHandler: def __init__(self, text_encoders, tokenizers): @@ -946,16 +1002,20 @@ def __init__( transforms.Normalize([0.5], [0.5]), ] ) + # if using B-LoRA for single image. 
do not use transformations + single_image = len(self.instance_images) < 2 for image in self.instance_images: - image = exif_transpose(image) + if not single_image: + image = exif_transpose(image) if not image.mode == "RGB": image = image.convert("RGB") self.original_sizes.append((image.height, image.width)) image = train_resize(image) - if args.random_flip and random.random() < 0.5: + + if not single_image and args.random_flip and random.random() < 0.5: # flip image = train_flip(image) - if args.center_crop: + if args.center_crop or single_image: y1 = max(0, int(round((image.height - args.resolution) / 2.0))) x1 = max(0, int(round((image.width - args.resolution) / 2.0))) image = train_crop(image) @@ -1216,7 +1276,7 @@ def main(args): images = pipeline(example["prompt"]).images for i, image in enumerate(images): - hash_image = hashlib.sha1(image.tobytes()).hexdigest() + hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest() image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) @@ -1374,12 +1434,24 @@ def main(args): text_encoder_two.gradient_checkpointing_enable() # now we will add new LoRA weights to the attention layers + + if args.use_blora: + # if using B-LoRA, the targeted blocks to train are automatically set + target_modules = get_unet_lora_target_modules(unet, use_blora=True) + elif args.lora_unet_blocks: + # if training specific unet blocks not in the B-LoRA scheme + target_blocks_list = "".join(args.lora_unet_blocks.split()).split(",") + logger.info(f"list of unet blocks to train: {target_blocks_list}") + target_modules = get_unet_lora_target_modules(unet, use_blora=False, target_blocks=target_blocks_list) + else: + target_modules = ["to_k", "to_q", "to_v", "to_out.0"] + unet_lora_config = LoraConfig( r=args.rank, - lora_alpha=args.rank, use_dora=args.use_dora, + lora_alpha=args.rank, init_lora_weights="gaussian", - target_modules=["to_k", "to_q", "to_v", "to_out.0"], + target_modules=target_modules, ) unet.add_adapter(unet_lora_config) @@ -1388,8 +1460,8 @@ def main(args): if args.train_text_encoder: text_lora_config = LoraConfig( r=args.rank, - lora_alpha=args.rank, use_dora=args.use_dora, + lora_alpha=args.rank, init_lora_weights="gaussian", target_modules=["q_proj", "k_proj", "v_proj", "out_proj"], ) @@ -1505,6 +1577,7 @@ def load_model_hook(models, input_dir): models = [unet_] if args.train_text_encoder: models.extend([text_encoder_one_, text_encoder_two_]) + # only upcast trainable parameters (LoRA) into fp32 cast_training_params(models) accelerator.register_save_state_pre_hook(save_model_hook) @@ -1525,6 +1598,8 @@ def load_model_hook(models, input_dir): models = [unet] if args.train_text_encoder: models.extend([text_encoder_one, text_encoder_two]) + + # only upcast trainable parameters (LoRA) into fp32 cast_training_params(models, dtype=torch.float32) unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters())) @@ -1780,7 +1855,12 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. if accelerator.is_main_process: - accelerator.init_trackers("dreambooth-lora-sd-xl", config=vars(args)) + tracker_name = ( + "dreambooth-lora-sd-xl" + if "playground" not in args.pretrained_model_name_or_path + else "dreambooth-lora-playground" + ) + accelerator.init_trackers(tracker_name, config=vars(args)) # Train! 
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -1833,7 +1913,6 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers): ) def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): - # TODO: revisit other sampling algorithms sigmas = noise_scheduler.sigmas.to(device=accelerator.device, dtype=dtype) schedule_timesteps = noise_scheduler.timesteps.to(accelerator.device) timesteps = timesteps.to(accelerator.device) @@ -1852,6 +1931,7 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): # flag used for textual inversion pivoted = False for epoch in range(first_epoch, args.num_train_epochs): + unet.train() # if performing any kind of optimization of text_encoder params if args.train_text_encoder or args.train_text_encoder_ti: if epoch == num_train_epochs_text_encoder: @@ -1869,7 +1949,6 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): text_encoder_one.text_model.embeddings.requires_grad_(True) text_encoder_two.text_model.embeddings.requires_grad_(True) - unet.train() for step, batch in enumerate(train_dataloader): if pivoted: # stopping optimization of text_encoder params @@ -1970,7 +2049,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions, - ).sample + return_dict=False, + )[0] else: unet_added_conditions = {"time_ids": add_time_ids} prompt_embeds, pooled_prompt_embeds = encode_prompt( @@ -1988,7 +2068,8 @@ def get_sigmas(timesteps, n_dim=4, dtype=torch.float32): timesteps, prompt_embeds_input, added_cond_kwargs=unet_added_conditions, - ).sample + return_dict=False, + )[0] weighting = None if args.do_edm_style_training: From 725ead2f5ec6f3f4beac66a5bddcee17647b9599 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 30 Apr 2024 20:14:18 +0530 Subject: [PATCH 25/56] SSH Runner Workflow Update (#7822) * add debug workflow * update --- .github/workflows/ssh-runner.yml | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index befebfbc9b96..e5bbdd64f549 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -28,19 +28,10 @@ jobs: options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: Update clone - working-directory: /diffusers - run: | - git fetch && git checkout ${{ github.sha }} - - name: Cleanup - working-directory: /diffusers - run: | - rm -rf tests/__pycache__ - rm -rf tests/models/__pycache__ - rm -rf reports - - name: Show installed libraries and their versions - working-directory: /diffusers - run: pip freeze + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 - name: NVIDIA-SMI run: | From b8ccb462596d336ce892e329ba69fa12394e9964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Tue, 30 Apr 2024 20:53:27 +0300 Subject: [PATCH 26/56] Fix CPU offload in docstring (#7827) Fix cpu offload --- .../controlnet/pipeline_controlnet_sd_xl_img2img.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py index d7889a9efbb5..dfd3cc239b36 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ 
-89,8 +89,8 @@ ... variant="fp16", ... use_safetensors=True, ... torch_dtype=torch.float16, - ... ).to("cuda") - >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda") + ... ) + >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) >>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( ... "stabilityai/stable-diffusion-xl-base-1.0", ... controlnet=controlnet, @@ -98,7 +98,7 @@ ... variant="fp16", ... use_safetensors=True, ... torch_dtype=torch.float16, - ... ).to("cuda") + ... ) >>> pipe.enable_model_cpu_offload() From 0d083702637ca61e7dd8533f6a3aa7558fce6d3b Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:10:14 -0700 Subject: [PATCH 27/56] [docs] Community pipelines (#7819) * community pipelines * feedback * consolidate --- docs/source/en/_toctree.yml | 4 - docs/source/en/conceptual/contribution.md | 87 ++++++--- .../en/using-diffusers/contribute_pipeline.md | 184 ------------------ .../custom_pipeline_examples.md | 119 ----------- .../custom_pipeline_overview.md | 101 +++++++++- 5 files changed, 165 insertions(+), 330 deletions(-) delete mode 100644 docs/source/en/using-diffusers/contribute_pipeline.md delete mode 100644 docs/source/en/using-diffusers/custom_pipeline_examples.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 357afb2ea261..f2755798b792 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -87,10 +87,6 @@ title: Shap-E - local: using-diffusers/diffedit title: DiffEdit - - local: using-diffusers/custom_pipeline_examples - title: Community pipelines - - local: using-diffusers/contribute_pipeline - title: Contribute a community pipeline - local: using-diffusers/inference_with_lcm_lora title: Latent Consistency Model-LoRA - local: using-diffusers/inference_with_lcm diff --git a/docs/source/en/conceptual/contribution.md b/docs/source/en/conceptual/contribution.md index 24ac52ba19c9..cc2e0ae07b2c 100644 --- a/docs/source/en/conceptual/contribution.md +++ b/docs/source/en/conceptual/contribution.md @@ -198,38 +198,81 @@ Anything displayed on [the official Diffusers doc page](https://huggingface.co/d Please have a look at [this page](https://github.com/huggingface/diffusers/tree/main/docs) on how to verify changes made to the documentation locally. - ### 6. Contribute a community pipeline -[Pipelines](https://huggingface.co/docs/diffusers/api/pipelines/overview) are usually the first point of contact between the Diffusers library and the user. -Pipelines are examples of how to use Diffusers [models](https://huggingface.co/docs/diffusers/api/models/overview) and [schedulers](https://huggingface.co/docs/diffusers/api/schedulers/overview). -We support two types of pipelines: +> [!TIP] +> Read the [Community pipelines](../using-diffusers/custom_pipeline_overview#community-pipelines) guide to learn more about the difference between a GitHub and Hugging Face Hub community pipeline. If you're interested in why we have community pipelines, take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) (basically, we can't maintain all the possible ways diffusion models can be used for inference but we also don't want to prevent the community from building them). + +Contributing a community pipeline is a great way to share your creativity and work with the community. 
It lets you build on top of the [`DiffusionPipeline`] so that anyone can load and use it by setting the `custom_pipeline` parameter. This section will walk you through how to create a simple pipeline where the UNet only does a single forward pass and calls the scheduler once (a "one-step" pipeline). + +1. Create a one_step_unet.py file for your community pipeline. This file can contain whatever package you want to use as long as it's installed by the user. Make sure you only have one pipeline class that inherits from [`DiffusionPipeline`] to load model weights and the scheduler configuration from the Hub. Add a UNet and scheduler to the `__init__` function. + + You should also add the `register_modules` function to ensure your pipeline and its components can be saved with [`~DiffusionPipeline.save_pretrained`]. + +```py +from diffusers import DiffusionPipeline +import torch + +class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() + + self.register_modules(unet=unet, scheduler=scheduler) +``` + +1. In the forward pass (which we recommend defining as `__call__`), you can add any feature you'd like. For the "one-step" pipeline, create a random image and call the UNet and scheduler once by setting `timestep=1`. + +```py + from diffusers import DiffusionPipeline + import torch + + class UnetSchedulerOneForwardPipeline(DiffusionPipeline): + def __init__(self, unet, scheduler): + super().__init__() + + self.register_modules(unet=unet, scheduler=scheduler) -- Official Pipelines -- Community Pipelines + def __call__(self): + image = torch.randn( + (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), + ) + timestep = 1 + + model_output = self.unet(image, timestep).sample + scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample + + return scheduler_output +``` -Both official and community pipelines follow the same design and consist of the same type of components. +Now you can run the pipeline by passing a UNet and scheduler to it or load pretrained weights if the pipeline structure is identical. + +```py +from diffusers import DDPMScheduler, UNet2DModel + +scheduler = DDPMScheduler() +unet = UNet2DModel() + +pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler) +output = pipeline() +# load pretrained weights +pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) +output = pipeline() +``` -Official pipelines are tested and maintained by the core maintainers of Diffusers. Their code -resides in [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines). -In contrast, community pipelines are contributed and maintained purely by the **community** and are **not** tested. -They reside in [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and while they can be accessed via the [PyPI diffusers package](https://pypi.org/project/diffusers/), their code is not part of the PyPI distribution. +You can either share your pipeline as a GitHub community pipeline or Hub community pipeline. -The reason for the distinction is that the core maintainers of the Diffusers library cannot maintain and test all -possible ways diffusion models can be used for inference, but some of them may be of interest to the community. 
-Officially released diffusion pipelines, -such as Stable Diffusion are added to the core src/diffusers/pipelines package which ensures -high quality of maintenance, no backward-breaking code changes, and testing. -More bleeding edge pipelines should be added as community pipelines. If usage for a community pipeline is high, the pipeline can be moved to the official pipelines upon request from the community. This is one of the ways we strive to be a community-driven library. + + -To add a community pipeline, one should add a .py file to [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) and adapt the [examples/community/README.md](https://github.com/huggingface/diffusers/tree/main/examples/community/README.md) to include an example of the new pipeline. +Share your GitHub pipeline by opening a pull request on the Diffusers [repository](https://github.com/huggingface/diffusers) and add the one_step_unet.py file to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder. -An example can be seen [here](https://github.com/huggingface/diffusers/pull/2400). + + -Community pipeline PRs are only checked at a superficial level and ideally they should be maintained by their original authors. +Share your Hub pipeline by creating a model repository on the Hub and uploading the one_step_unet.py file to it. -Contributing a community pipeline is a great way to understand how Diffusers models and schedulers work. Having contributed a community pipeline is usually the first stepping stone to contributing an official pipeline to the -core package. + + ### 7. Contribute to training examples diff --git a/docs/source/en/using-diffusers/contribute_pipeline.md b/docs/source/en/using-diffusers/contribute_pipeline.md deleted file mode 100644 index e9cf1ed1ce02..000000000000 --- a/docs/source/en/using-diffusers/contribute_pipeline.md +++ /dev/null @@ -1,184 +0,0 @@ - - -# Contribute a community pipeline - - - -💡 Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down. - - - -Community pipelines allow you to add any additional features you'd like on top of the [`DiffusionPipeline`]. The main benefit of building on top of the `DiffusionPipeline` is anyone can load and use your pipeline by only adding one more argument, making it super easy for the community to access. - -This guide will show you how to create a community pipeline and explain how they work. To keep things simple, you'll create a "one-step" pipeline where the `UNet` does a single forward pass and calls the scheduler once. - -## Initialize the pipeline - -You should start by creating a `one_step_unet.py` file for your community pipeline. In this file, create a pipeline class that inherits from the [`DiffusionPipeline`] to be able to load model weights and the scheduler configuration from the Hub. 
The one-step pipeline needs a `UNet` and a scheduler, so you'll need to add these as arguments to the `__init__` function: - -```python -from diffusers import DiffusionPipeline -import torch - -class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() -``` - -To ensure your pipeline and its components (`unet` and `scheduler`) can be saved with [`~DiffusionPipeline.save_pretrained`], add them to the `register_modules` function: - -```diff - from diffusers import DiffusionPipeline - import torch - - class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - -+ self.register_modules(unet=unet, scheduler=scheduler) -``` - -Cool, the `__init__` step is done and you can move to the forward pass now! 🔥 - -## Define the forward pass - -In the forward pass, which we recommend defining as `__call__`, you have complete creative freedom to add whatever feature you'd like. For our amazing one-step pipeline, create a random image and only call the `unet` and `scheduler` once by setting `timestep=1`: - -```diff - from diffusers import DiffusionPipeline - import torch - - class UnetSchedulerOneForwardPipeline(DiffusionPipeline): - def __init__(self, unet, scheduler): - super().__init__() - - self.register_modules(unet=unet, scheduler=scheduler) - -+ def __call__(self): -+ image = torch.randn( -+ (1, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), -+ ) -+ timestep = 1 - -+ model_output = self.unet(image, timestep).sample -+ scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - -+ return scheduler_output -``` - -That's it! 🚀 You can now run this pipeline by passing a `unet` and `scheduler` to it: - -```python -from diffusers import DDPMScheduler, UNet2DModel - -scheduler = DDPMScheduler() -unet = UNet2DModel() - -pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler) - -output = pipeline() -``` - -But what's even better is you can load pre-existing weights into the pipeline if the pipeline structure is identical. For example, you can load the [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32) weights into the one-step pipeline: - -```python -pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) - -output = pipeline() -``` - -## Share your pipeline - -Open a Pull Request on the 🧨 Diffusers [repository](https://github.com/huggingface/diffusers) to add your awesome pipeline in `one_step_unet.py` to the [examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) subfolder. - -Once it is merged, anyone with `diffusers >= 0.4.0` installed can use this pipeline magically 🪄 by specifying it in the `custom_pipeline` argument: - -```python -from diffusers import DiffusionPipeline - -pipe = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True -) -pipe() -``` - -Another way to share your community pipeline is to upload the `one_step_unet.py` file directly to your preferred [model repository](https://huggingface.co/docs/hub/models-uploading) on the Hub. 
Instead of specifying the `one_step_unet.py` file, pass the model repository id to the `custom_pipeline` argument: - -```python -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True -) -``` - -Take a look at the following table to compare the two sharing workflows to help you decide the best option for you: - -| | GitHub community pipeline | HF Hub community pipeline | -|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| -| usage | same | same | -| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow | -| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility | - - - -💡 You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` because this is automatically detected. - - - -## How do community pipelines work? - -A community pipeline is a class that inherits from [`DiffusionPipeline`] which means: - -- It can be loaded with the [`custom_pipeline`] argument. -- The model weights and scheduler configuration are loaded from [`pretrained_model_name_or_path`]. -- The code that implements a feature in the community pipeline is defined in a `pipeline.py` file. - -Sometimes you can't load all the pipeline components weights from an official repository. In this case, the other components should be passed directly to the pipeline: - -```python -from diffusers import DiffusionPipeline -from transformers import CLIPImageProcessor, CLIPModel - -model_id = "CompVis/stable-diffusion-v1-4" -clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - -feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id) -clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) - -pipeline = DiffusionPipeline.from_pretrained( - model_id, - custom_pipeline="clip_guided_stable_diffusion", - clip_model=clip_model, - feature_extractor=feature_extractor, - scheduler=scheduler, - torch_dtype=torch.float16, - use_safetensors=True, -) -``` - -The magic behind community pipelines is contained in the following code. It allows the community pipeline to be loaded from GitHub or the Hub, and it'll be available to all 🧨 Diffusers packages. - -```python -# 2. 
Load the pipeline class, if using custom module then load it from the Hub -# if we load from explicit class, let's use it -if custom_pipeline is not None: - pipeline_class = get_class_from_dynamic_module( - custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline - ) -elif cls != DiffusionPipeline: - pipeline_class = cls -else: - diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) - pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) -``` diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.md b/docs/source/en/using-diffusers/custom_pipeline_examples.md deleted file mode 100644 index 203302ed3ead..000000000000 --- a/docs/source/en/using-diffusers/custom_pipeline_examples.md +++ /dev/null @@ -1,119 +0,0 @@ - - -# Community pipelines - -[[open-in-colab]] - - - -For more context about the design choices behind community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841). - - - -Community pipelines allow you to get creative and build your own unique pipelines to share with the community. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder along with inference and training examples for how to use them. This guide showcases some of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR with your own pipeline and we will merge it!). - -To load a community pipeline, use the `custom_pipeline` argument in [`DiffusionPipeline`] to specify one of the files in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community): - -```py -from diffusers import DiffusionPipeline - -pipe = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder", use_safetensors=True -) -``` - -If a community pipeline doesn't work as expected, please open a GitHub issue and mention the author. - -You can learn more about community pipelines in the how to [load community pipelines](custom_pipeline_overview) and how to [contribute a community pipeline](contribute_pipeline) guides. - -## Multilingual Stable Diffusion - -The multilingual Stable Diffusion pipeline uses a pretrained [XLM-RoBERTa](https://huggingface.co/papluca/xlm-roberta-base-language-detection) to identify a language and the [mBART-large-50](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) model to handle the translation. This allows you to generate images from text in 20 languages. 
- -```py -import torch -from diffusers import DiffusionPipeline -from diffusers.utils import make_image_grid -from transformers import ( - pipeline, - MBart50TokenizerFast, - MBartForConditionalGeneration, -) - -device = "cuda" if torch.cuda.is_available() else "cpu" -device_dict = {"cuda": 0, "cpu": -1} - -# add language detection pipeline -language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection" -language_detection_pipeline = pipeline("text-classification", - model=language_detection_model_ckpt, - device=device_dict[device]) - -# add model for language translation -translation_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt") -translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device) - -diffuser_pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="multilingual_stable_diffusion", - detection_pipeline=language_detection_pipeline, - translation_model=translation_model, - translation_tokenizer=translation_tokenizer, - torch_dtype=torch.float16, -) - -diffuser_pipeline.enable_attention_slicing() -diffuser_pipeline = diffuser_pipeline.to(device) - -prompt = ["a photograph of an astronaut riding a horse", - "Una casa en la playa", - "Ein Hund, der Orange isst", - "Un restaurant parisien"] - -images = diffuser_pipeline(prompt).images -make_image_grid(images, rows=2, cols=2) -``` - -
- -
- -## MagicMix - -[MagicMix](https://huggingface.co/papers/2210.16056) is a pipeline that can mix an image and text prompt to generate a new image that preserves the image structure. The `mix_factor` determines how much influence the prompt has on the layout generation, `kmin` controls the number of steps during the content generation process, and `kmax` determines how much information is kept in the layout of the original image. - -```py -from diffusers import DiffusionPipeline, DDIMScheduler -from diffusers.utils import load_image, make_image_grid - -pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - custom_pipeline="magic_mix", - scheduler=DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler"), -).to('cuda') - -img = load_image("https://user-images.githubusercontent.com/59410571/209578593-141467c7-d831-4792-8b9a-b17dc5e47816.jpg") -mix_img = pipeline(img, prompt="bed", kmin=0.3, kmax=0.5, mix_factor=0.5) -make_image_grid([img, mix_img], rows=1, cols=2) -``` - -
-
-original image
-image and text prompt mix
-
diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md index 0b6bb53f10d6..ef26e546e4d4 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.md +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md @@ -16,11 +16,19 @@ specific language governing permissions and limitations under the License. ## Community pipelines +> [!TIP] Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down. + Community pipelines are any [`DiffusionPipeline`] class that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). -There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. Refer to this [table](./contribute_pipeline#share-your-pipeline) for a more detailed comparison of Hub vs GitHub community pipelines. +There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are only limited to custom pipeline code. + +| | GitHub community pipeline | HF Hub community pipeline | +|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------| +| usage | same | same | +| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow | +| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility | @@ -161,6 +169,97 @@ out_lpw +## Example community pipelines + +Community pipelines are a really fun and creative way to extend the capabilities of the original pipeline with new and unique features. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder with inference and training examples for how to use them. + +This section showcases a couple of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR for your community pipeline and ping us for a review)! 
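Both kinds of community pipeline are loaded the same way, through the `custom_pipeline` argument of [`~DiffusionPipeline.from_pretrained`]. As a minimal sketch (the `one_step_unet` filename and the `stevhliu/one_step_unet` repo id below are only illustrative examples):

```py
from diffusers import DiffusionPipeline

# GitHub community pipeline: pass the name of a file in examples/community
pipeline = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", use_safetensors=True
)

# Hub community pipeline: pass the repo id that hosts the pipeline file
pipeline = DiffusionPipeline.from_pretrained(
    "google/ddpm-cifar10-32", custom_pipeline="stevhliu/one_step_unet", use_safetensors=True
)
```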
+ +> [!TIP] +> The [`~DiffusionPipeline.from_pipe`] method is particularly useful for loading community pipelines because many of them don't have pretrained weights and add a feature on top of an existing pipeline like Stable Diffusion or Stable Diffusion XL. You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Load with from_pipe](custom_pipeline_overview#load-with-from_pipe) section. + + + + +[Marigold](https://marigoldmonodepth.github.io/) is a depth estimation diffusion pipeline that uses the rich existing and inherent visual knowledge in diffusion models. It takes an input image and denoises and decodes it into a depth map. Marigold performs well even on images it hasn't seen before. + +```py +import torch +from PIL import Image +from diffusers import DiffusionPipeline +from diffusers.utils import load_image + +pipeline = DiffusionPipeline.from_pretrained( + "prs-eth/marigold-lcm-v1-0", + custom_pipeline="marigold_depth_estimation", + torch_dtype=torch.float16, + variant="fp16", +) + +pipeline.to("cuda") +image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png") +output = pipeline( + image, + denoising_steps=4, + ensemble_size=5, + processing_res=768, + match_input_res=True, + batch_size=0, + seed=33, + color_map="Spectral", + show_progress_bar=True, +) +depth_colored: Image.Image = output.depth_colored +depth_colored.save("./depth_colored.png") +``` + +
+
+original image
+colorized depth image
+
+ +
+ + +[HD-Painter](https://hf.co/papers/2312.14091) is a high-resolution inpainting pipeline. It introduces a *Prompt-Aware Introverted Attention (PAIntA)* layer to better align a prompt with the area to be inpainted, and *Reweighting Attention Score Guidance (RASG)* to keep the latents more prompt-aligned and within their trained domain to generate realistc images. + +```py +import torch +from diffusers import DiffusionPipeline, DDIMScheduler +from diffusers.utils import load_image + +pipeline = DiffusionPipeline.from_pretrained( + "Lykon/dreamshaper-8-inpainting", + custom_pipeline="hd_painter" +) +pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-mask.png") +prompt = "football" +image = pipeline(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(0)).images[0] +image +``` + +
+
+original image
+generated image
+
+ +
+
+ ## Community components Community components allow users to build pipelines that may have customized components that are not a part of Diffusers. If your pipeline has custom components that Diffusers doesn't already support, you need to provide their implementations as Python modules. These customized components could be a VAE, UNet, and scheduler. In most cases, the text encoder is imported from the Transformers library. The pipeline code itself can also be customized. From c1edb03c372c12f92b1b5580b718e8cf2196016c Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 1 May 2024 17:36:54 +0530 Subject: [PATCH 28/56] Fix for pipeline slow test fetcher (#7824) * update * update --- .github/workflows/nightly_tests.yml | 44 ++++++++++++++--------------- .github/workflows/push_tests.yml | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 2f73c66de829..d911dab4a306 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -19,7 +19,7 @@ env: jobs: setup_torch_cuda_pipeline_matrix: name: Setup Torch Pipelines Matrix - runs-on: ubuntu-latest + runs-on: diffusers/diffusers-pytorch-cpu outputs: pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }} steps: @@ -67,19 +67,19 @@ jobs: fetch-depth: 2 - name: NVIDIA-SMI run: nvidia-smi - + - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git python -m uv pip install pytest-reportlog - + - name: Environment run: | python utils/print_env.py - - - name: Nightly PyTorch CUDA checkpoint (pipelines) tests + + - name: Nightly PyTorch CUDA checkpoint (pipelines) tests env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms @@ -88,9 +88,9 @@ jobs: python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_pipeline_${{ matrix.module }}_cuda \ - --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \ + --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \ tests/pipelines/${{ matrix.module }} - + - name: Failure short reports if: ${{ failure() }} run: | @@ -103,7 +103,7 @@ jobs: with: name: pipeline_${{ matrix.module }}_test_reports path: reports - + - name: Generate Report and Notify Channel if: always() run: | @@ -139,7 +139,7 @@ jobs: run: python utils/print_env.py - name: Run nightly PyTorch CUDA tests for non-pipeline modules - if: ${{ matrix.module != 'examples'}} + if: ${{ matrix.module != 'examples'}} env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms @@ -148,7 +148,7 @@ jobs: python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_torch_${{ matrix.module }}_cuda \ - --report-log=tests_torch_${{ matrix.module }}_cuda.log \ + --report-log=tests_torch_${{ matrix.module }}_cuda.log \ tests/${{ matrix.module }} - name: Run nightly example tests with Torch @@ -161,13 +161,13 @@ jobs: python -m uv pip install peft@git+https://github.com/huggingface/peft.git python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v --make-reports=examples_torch_cuda \ - 
--report-log=examples_torch_cuda.log \ + --report-log=examples_torch_cuda.log \ examples/ - name: Failure short reports if: ${{ failure() }} run: | - cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt + cat reports/tests_torch_${{ matrix.module }}_cuda_stats.txt cat reports/tests_torch_${{ matrix.module }}_cuda_failures_short.txt - name: Test suite reports artifacts @@ -218,13 +218,13 @@ jobs: python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "not Flax and not Onnx" \ --make-reports=tests_torch_lora_cuda \ - --report-log=tests_torch_lora_cuda.log \ + --report-log=tests_torch_lora_cuda.log \ tests/lora - + - name: Failure short reports if: ${{ failure() }} run: | - cat reports/tests_torch_lora_cuda_stats.txt + cat reports/tests_torch_lora_cuda_stats.txt cat reports/tests_torch_lora_cuda_failures_short.txt - name: Test suite reports artifacts @@ -239,12 +239,12 @@ jobs: run: | pip install slack_sdk tabulate python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY - + run_flax_tpu_tests: name: Nightly Flax TPU Tests runs-on: docker-tpu if: github.event_name == 'schedule' - + container: image: diffusers/diffusers-flax-tpu options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged @@ -274,7 +274,7 @@ jobs: python -m pytest -n 0 \ -s -v -k "Flax" \ --make-reports=tests_flax_tpu \ - --report-log=tests_flax_tpu.log \ + --report-log=tests_flax_tpu.log \ tests/ - name: Failure short reports @@ -302,7 +302,7 @@ jobs: container: image: diffusers/diffusers-onnxruntime-cuda options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ - + steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -321,7 +321,7 @@ jobs: - name: Environment run: python utils/print_env.py - + - name: Run nightly ONNXRuntime CUDA tests env: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -329,7 +329,7 @@ jobs: python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v -k "Onnx" \ --make-reports=tests_onnx_cuda \ - --report-log=tests_onnx_cuda.log \ + --report-log=tests_onnx_cuda.log \ tests/ - name: Failure short reports @@ -344,7 +344,7 @@ jobs: with: name: ${{ matrix.config.report }}_test_reports path: reports - + - name: Generate Report and Notify Channel if: always() run: | diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index a6cb123a7035..d5e1c4739497 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -21,7 +21,7 @@ env: jobs: setup_torch_cuda_pipeline_matrix: name: Setup Torch Pipelines CUDA Slow Tests Matrix - runs-on: ubuntu-latest + runs-on: diffusers/diffusers-pytorch-cpu outputs: pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }} steps: From 8909ab4b192500bcc3c17d839ae101cc669e9d8e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 1 May 2024 18:45:47 +0530 Subject: [PATCH 29/56] [Tests] fix: device map tests for models (#7825) * fix: device module tests * remove patch file * Empty-Commit --- tests/models/test_modeling_common.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index d8a93d40c8bf..d9e70c6dd784 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -691,6 +691,9 @@ def test_deprecated_kwargs(self): def test_cpu_offload(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() + if 
model._no_split_modules is None: + return + model = model.to(torch_device) torch.manual_seed(0) @@ -718,6 +721,9 @@ def test_cpu_offload(self): def test_disk_offload_without_safetensors(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() + if model._no_split_modules is None: + return + model = model.to(torch_device) torch.manual_seed(0) @@ -728,12 +734,12 @@ def test_disk_offload_without_safetensors(self): model.cpu().save_pretrained(tmp_dir, safe_serialization=False) with self.assertRaises(ValueError): - max_size = int(self.model_split_percents[1] * model_size) + max_size = int(self.model_split_percents[0] * model_size) max_memory = {0: max_size, "cpu": max_size} # This errors out because it's missing an offload folder new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory) - max_size = int(self.model_split_percents[1] * model_size) + max_size = int(self.model_split_percents[0] * model_size) max_memory = {0: max_size, "cpu": max_size} new_model = self.model_class.from_pretrained( tmp_dir, device_map="auto", max_memory=max_memory, offload_folder=tmp_dir @@ -749,6 +755,9 @@ def test_disk_offload_without_safetensors(self): def test_disk_offload_with_safetensors(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() + if model._no_split_modules is None: + return + model = model.to(torch_device) torch.manual_seed(0) @@ -758,7 +767,7 @@ def test_disk_offload_with_safetensors(self): with tempfile.TemporaryDirectory() as tmp_dir: model.cpu().save_pretrained(tmp_dir) - max_size = int(self.model_split_percents[1] * model_size) + max_size = int(self.model_split_percents[0] * model_size) max_memory = {0: max_size, "cpu": max_size} new_model = self.model_class.from_pretrained( tmp_dir, device_map="auto", offload_folder=tmp_dir, max_memory=max_memory @@ -774,6 +783,9 @@ def test_disk_offload_with_safetensors(self): def test_model_parallelism(self): config, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**config).eval() + if model._no_split_modules is None: + return + model = model.to(torch_device) torch.manual_seed(0) From 21a7ff12a75ecf43a85898838d1990cda853ffaf Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 1 May 2024 06:25:57 -1000 Subject: [PATCH 30/56] update the logic of `is_sequential_cpu_offload` (#7788) * up * add comment to the tests + fix dit --------- Co-authored-by: Sayak Paul --- .../community/pipeline_demofusion_sdxl.py | 6 +- src/diffusers/loaders/lora.py | 6 +- src/diffusers/loaders/textual_inversion.py | 6 +- src/diffusers/loaders/unet.py | 6 +- src/diffusers/pipelines/dit/pipeline_dit.py | 3 + src/diffusers/pipelines/pipeline_utils.py | 9 +- tests/pipelines/pixart_alpha/test_pixart.py | 4 - tests/pipelines/pixart_sigma/test_pixart.py | 4 - tests/pipelines/test_pipelines_common.py | 103 ++++++++++++++++-- 9 files changed, 123 insertions(+), 24 deletions(-) diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py index 93e1463638f0..f46d635dae2b 100644 --- a/examples/community/pipeline_demofusion_sdxl.py +++ b/examples/community/pipeline_demofusion_sdxl.py @@ -1304,7 +1304,11 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di if isinstance(component, torch.nn.Module): if hasattr(component, "_hf_hook"): is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - 
is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + is_sequential_cpu_offload = ( + isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + or hasattr(component._hf_hook, "hooks") + and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) + ) logger.info( "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." ) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 8703cdee4011..d69db5a83af1 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -369,7 +369,11 @@ def _optionally_disable_offloading(cls, _pipeline): if not is_model_cpu_offload: is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) if not is_sequential_cpu_offload: - is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook) + is_sequential_cpu_offload = ( + isinstance(component._hf_hook, AlignDevicesHook) + or hasattr(component._hf_hook, "hooks") + and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) + ) logger.info( "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index c1c224975cb8..05ed64f5dcad 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -423,7 +423,11 @@ def load_textual_inversion( if isinstance(component, nn.Module): if hasattr(component, "_hf_hook"): is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + is_sequential_cpu_offload = ( + isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + or hasattr(component._hf_hook, "hooks") + and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) + ) logger.info( "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again." ) diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 294db44ee61d..3e74411865a3 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -359,7 +359,11 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict for _, component in _pipeline.components.items(): if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) - is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + is_sequential_cpu_offload = ( + isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + or hasattr(component._hf_hook, "hooks") + and isinstance(component._hf_hook.hooks[0], AlignDevicesHook) + ) logger.info( "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." 
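Note: the loaders touched above (the DemoFusion community pipeline, `lora.py`, `textual_inversion.py`, `unet.py`) all gain the same two-branch check. A minimal standalone sketch of that logic follows; it assumes only the public `accelerate.hooks` classes and the `_hf_hook` attribute that accelerate attaches, and the helper names are hypothetical rather than part of diffusers.

```python
# Sketch only: mirrors the check added in this patch; not an actual diffusers helper.
from accelerate.hooks import AlignDevicesHook, CpuOffload


def is_sequential_cpu_offload(module) -> bool:
    """True if the module was offloaded via enable_sequential_cpu_offload()."""
    hook = getattr(module, "_hf_hook", None)
    if hook is None:
        return False
    if isinstance(hook, AlignDevicesHook):
        return True
    # Sequential offload may also wrap hooks in a container (e.g. a SequentialHook)
    # whose `.hooks` list starts with the AlignDevicesHook, hence the second branch.
    return hasattr(hook, "hooks") and len(hook.hooks) > 0 and isinstance(hook.hooks[0], AlignDevicesHook)


def is_model_cpu_offload(module) -> bool:
    """True if the module was offloaded via enable_model_cpu_offload()."""
    return isinstance(getattr(module, "_hf_hook", None), CpuOffload)
```

The second branch is what lets `load_lora_weights()` and `load_textual_inversion()` recognize sequential CPU offload when accelerate chains several hooks on one module, which the earlier single `isinstance` check missed.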
diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py index 289ea496028d..a3ea90874a12 100644 --- a/src/diffusers/pipelines/dit/pipeline_dit.py +++ b/src/diffusers/pipelines/dit/pipeline_dit.py @@ -227,6 +227,9 @@ def __call__( if output_type == "pil": samples = self.numpy_to_pil(samples) + # Offload all models + self.maybe_free_model_hooks() + if not return_dict: return (samples,) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 68433332546b..59e38c910d4a 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -376,7 +376,11 @@ def module_is_sequentially_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): return False - return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook) + return hasattr(module, "_hf_hook") and ( + isinstance(module._hf_hook, accelerate.hooks.AlignDevicesHook) + or hasattr(module._hf_hook, "hooks") + and isinstance(module._hf_hook.hooks[0], accelerate.hooks.AlignDevicesHook) + ) def module_is_offloaded(module): if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"): @@ -1005,8 +1009,7 @@ def remove_all_hooks(self): """ for _, model in self.components.items(): if isinstance(model, torch.nn.Module) and hasattr(model, "_hf_hook"): - is_sequential_cpu_offload = isinstance(getattr(model, "_hf_hook"), accelerate.hooks.AlignDevicesHook) - accelerate.hooks.remove_hook_from_module(model, recurse=is_sequential_cpu_offload) + accelerate.hooks.remove_hook_from_module(model, recurse=True) self._all_hooks = [] def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py index d981b55260c7..dd358af08395 100644 --- a/tests/pipelines/pixart_alpha/test_pixart.py +++ b/tests/pipelines/pixart_alpha/test_pixart.py @@ -324,10 +324,6 @@ def test_raises_warning_for_mask_feature(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=1e-3) - # PixArt transformer model does not work with sequential offload so skip it for now - def test_sequential_offload_forward_pass_twice(self): - pass - @slow @require_torch_gpu diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py index 7b1d5e389f32..c0df15ae661d 100644 --- a/tests/pipelines/pixart_sigma/test_pixart.py +++ b/tests/pipelines/pixart_sigma/test_pixart.py @@ -308,10 +308,6 @@ def test_inference_with_multiple_images_per_prompt(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=1e-3) - # PixArt transformer model does not work with sequential offload so skip it for now - def test_sequential_offload_forward_pass_twice(self): - pass - @slow @require_torch_gpu diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 0c0a765f662d..032fbb81ea31 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -1360,6 +1360,8 @@ def _test_attention_slicing_forward_pass( reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", ) def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4): + import accelerate + components = 
self.get_dummy_components() pipe = self.pipeline_class(**components) for component in pipe.components.values(): @@ -1373,6 +1375,7 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4): output_without_offload = pipe(**inputs)[0] pipe.enable_sequential_cpu_offload() + assert pipe._execution_device.type == pipe._offload_device.type inputs = self.get_dummy_inputs(generator_device) output_with_offload = pipe(**inputs)[0] @@ -1380,11 +1383,48 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4): max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results") + # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly + offloaded_modules = { + k: v + for k, v in pipe.components.items() + if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload + } + # 1. all offloaded modules should be saved to cpu and moved to meta device + self.assertTrue( + all(v.device.type == "meta" for v in offloaded_modules.values()), + f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}", + ) + # 2. all offloaded modules should have hook installed + self.assertTrue( + all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()), + f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}", + ) + # 3. all offloaded modules should have correct hooks installed, should be either one of these two + # - `AlignDevicesHook` + # - a SequentialHook` that contains `AlignDevicesHook` + offloaded_modules_with_incorrect_hooks = {} + for k, v in offloaded_modules.items(): + if hasattr(v, "_hf_hook"): + if isinstance(v._hf_hook, accelerate.hooks.SequentialHook): + # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook` + for hook in v._hf_hook.hooks: + if not isinstance(hook, accelerate.hooks.AlignDevicesHook): + offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0]) + elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook): + offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook) + + self.assertTrue( + len(offloaded_modules_with_incorrect_hooks) == 0, + f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}", + ) + @unittest.skipIf( torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.17.0"), reason="CPU offload is only available with CUDA and `accelerate v0.17.0` or higher", ) def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): + import accelerate + generator_device = "cpu" components = self.get_dummy_components() pipe = self.pipeline_class(**components) @@ -1400,19 +1440,39 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): output_without_offload = pipe(**inputs)[0] pipe.enable_model_cpu_offload() + assert pipe._execution_device.type == pipe._offload_device.type + inputs = self.get_dummy_inputs(generator_device) output_with_offload = pipe(**inputs)[0] max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results") - offloaded_modules = [ - v + + # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly + offloaded_modules = { + k: v for k, v 
in pipe.components.items() if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload - ] - ( - self.assertTrue(all(v.device.type == "cpu" for v in offloaded_modules)), - f"Not offloaded: {[v for v in offloaded_modules if v.device.type != 'cpu']}", + } + # 1. check if all offloaded modules are saved to cpu + self.assertTrue( + all(v.device.type == "cpu" for v in offloaded_modules.values()), + f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}", + ) + # 2. check if all offloaded modules have hooks installed + self.assertTrue( + all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()), + f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}", + ) + # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload` + offloaded_modules_with_incorrect_hooks = {} + for k, v in offloaded_modules.items(): + if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload): + offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook) + + self.assertTrue( + len(offloaded_modules_with_incorrect_hooks) == 0, + f"Not installed correct hook: {offloaded_modules_with_incorrect_hooks}", ) @unittest.skipIf( @@ -1444,16 +1504,24 @@ def test_cpu_offload_forward_pass_twice(self, expected_max_diff=2e-4): self.assertLess( max_diff, expected_max_diff, "running CPU offloading 2nd time should not affect the inference results" ) + + # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly offloaded_modules = { k: v for k, v in pipe.components.items() if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload } + # 1. check if all offloaded modules are saved to cpu self.assertTrue( all(v.device.type == "cpu" for v in offloaded_modules.values()), f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'cpu']}", ) - + # 2. check if all offloaded modules have hooks installed + self.assertTrue( + all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()), + f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}", + ) + # 3. check if all offloaded modules have correct type of hooks installed, should be `CpuOffload` offloaded_modules_with_incorrect_hooks = {} for k, v in offloaded_modules.items(): if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.CpuOffload): @@ -1493,19 +1561,36 @@ def test_sequential_offload_forward_pass_twice(self, expected_max_diff=2e-4): self.assertLess( max_diff, expected_max_diff, "running sequential offloading second time should have the inference results" ) + + # make sure all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are offloaded correctly offloaded_modules = { k: v for k, v in pipe.components.items() if isinstance(v, torch.nn.Module) and k not in pipe._exclude_from_cpu_offload } + # 1. check if all offloaded modules are moved to meta device self.assertTrue( all(v.device.type == "meta" for v in offloaded_modules.values()), f"Not offloaded: {[k for k, v in offloaded_modules.items() if v.device.type != 'meta']}", ) + # 2. check if all offloaded modules have hook installed + self.assertTrue( + all(hasattr(v, "_hf_hook") for k, v in offloaded_modules.items()), + f"No hook attached: {[k for k, v in offloaded_modules.items() if not hasattr(v, '_hf_hook')]}", + ) + # 3. 
check if all offloaded modules have correct hooks installed, should be either one of these two + # - `AlignDevicesHook` + # - a SequentialHook` that contains `AlignDevicesHook` offloaded_modules_with_incorrect_hooks = {} for k, v in offloaded_modules.items(): - if hasattr(v, "_hf_hook") and not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook): - offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook) + if hasattr(v, "_hf_hook"): + if isinstance(v._hf_hook, accelerate.hooks.SequentialHook): + # if it is a `SequentialHook`, we loop through its `hooks` attribute to check if it only contains `AlignDevicesHook` + for hook in v._hf_hook.hooks: + if not isinstance(hook, accelerate.hooks.AlignDevicesHook): + offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook.hooks[0]) + elif not isinstance(v._hf_hook, accelerate.hooks.AlignDevicesHook): + offloaded_modules_with_incorrect_hooks[k] = type(v._hf_hook) self.assertTrue( len(offloaded_modules_with_incorrect_hooks) == 0, From 5915c2985db162278e09196160d796166c89ad12 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 1 May 2024 06:27:43 -1000 Subject: [PATCH 31/56] [ip-adapter] fix ip-adapter for StableDiffusionInstructPix2PixPipeline (#7820) update prepare_ip_adapter_ for pix2pix --- ...eline_stable_diffusion_instruct_pix2pix.py | 95 +++++++++++++++++-- ...ne_stable_diffusion_xl_instruct_pix2pix.py | 1 - 2 files changed, 87 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index de2767e23952..0bf5a92a4fcc 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -172,6 +172,7 @@ def __call__( prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, @@ -296,6 +297,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, callback_on_step_end_tensor_inputs, ) self._guidance_scale = guidance_scale @@ -303,14 +306,6 @@ def __call__( device = self._execution_device - if ip_adapter_image is not None: - output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True - image_embeds, negative_image_embeds = self.encode_image( - ip_adapter_image, device, num_images_per_prompt, output_hidden_state - ) - if self.do_classifier_free_guidance: - image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds]) - if image is None: raise ValueError("`image` input cannot be undefined.") @@ -335,6 +330,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, ) + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) # 3. 
Preprocess image image = self.image_processor.preprocess(image) @@ -635,6 +638,65 @@ def encode_image(self, image, device, num_images_per_prompt, output_hidden_state return image_embeds, uncond_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat( + [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds] + ) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + ( + single_image_embeds, + single_negative_image_embeds, + single_negative_image_embeds, + ) = single_image_embeds.chunk(3) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat( + [single_image_embeds, single_negative_image_embeds, single_negative_image_embeds] + ) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: @@ -687,6 +749,8 @@ def check_inputs( negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, callback_on_step_end_tensor_inputs=None, ): if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): @@ -728,6 +792,21 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): shape = ( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index a2242bb099c5..5e7be370be01 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -436,7 +436,6 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs def check_inputs( self, prompt, From 435d37ce5acb01f446ebb6fdd274915bc2f27bc8 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Thu, 2 May 2024 06:03:52 +0530 Subject: [PATCH 32/56] [Tests] reduce the model size in the audioldm fast test (#7833) chore: initial size reduction of models --- tests/pipelines/audioldm/test_audioldm.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index f83dc8158e83..eddab54a3c03 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -66,16 +66,17 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, + block_out_channels=(8, 16), + layers_per_block=1, + norm_num_groups=8, sample_size=32, in_channels=4, out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(32, 64), + cross_attention_dim=(8, 16), class_embed_type="simple_projection", - projection_class_embeddings_input_dim=32, + projection_class_embeddings_input_dim=8, class_embeddings_concat=True, ) scheduler = DDIMScheduler( @@ -87,9 +88,10 @@ def get_dummy_components(self): ) torch.manual_seed(0) vae = AutoencoderKL( - block_out_channels=[32, 64], + block_out_channels=[8, 16], in_channels=1, out_channels=1, + norm_num_groups=8, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, @@ -98,14 +100,14 @@ def get_dummy_components(self): text_encoder_config = ClapTextConfig( bos_token_id=0, eos_token_id=2, - hidden_size=32, + hidden_size=8, intermediate_size=37, layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, + num_attention_heads=1, + num_hidden_layers=1, pad_token_id=1, vocab_size=1000, - projection_dim=32, + projection_dim=8, ) text_encoder = ClapTextModelWithProjection(text_encoder_config) tokenizer = 
RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) From c1b2a89e349b3b632c1f32052331e549516f87fc Mon Sep 17 00:00:00 2001 From: yunseong Cho Date: Thu, 2 May 2024 13:59:55 +0900 Subject: [PATCH 33/56] Fix key error for dictionary with randomized order in convert_ldm_unet_checkpoint (#7680) fix key error for different order Co-authored-by: yunseong Co-authored-by: Dhruv Nair --- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index f04a21ef4857..b4b1b885dd3c 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -557,7 +557,7 @@ def convert_ldm_unet_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config ) - output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + output_block_list = {k: sorted(v) for k, v in sorted(output_block_list.items())} if ["conv.bias", "conv.weight"] in output_block_list.values(): index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ From 3ffa7b46e5d896dc35264b50325460f554556a93 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 2 May 2024 13:08:57 +0530 Subject: [PATCH 34/56] Fix hanging pipeline fetching (#7837) update --- .github/workflows/push_tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index d5e1c4739497..573802081704 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -21,7 +21,9 @@ env: jobs: setup_torch_cuda_pipeline_matrix: name: Setup Torch Pipelines CUDA Slow Tests Matrix - runs-on: diffusers/diffusers-pytorch-cpu + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] + container: + image: diffusers/diffusers-pytorch-cpu outputs: pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }} steps: From 03ca11318e61de5f41c4406ac455d926b32e5714 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 2 May 2024 13:15:38 +0530 Subject: [PATCH 35/56] Update download diff format tests (#7831) update Co-authored-by: Sayak Paul --- .../stable_diffusion/test_stable_diffusion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 145e0012f8e9..137f1e696d93 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1257,8 +1257,8 @@ def tearDown(self): def test_download_from_hub(self): ckpt_paths = [ - "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", - "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix_base.ckpt", + "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors", + "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors", ] for ckpt_path in ckpt_paths: @@ -1271,7 +1271,7 @@ def test_download_from_hub(self): assert image_out.shape == (512, 512, 3) def test_download_local(self): - ckpt_filename = 
hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt") + ckpt_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.safetensors") config_filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-inference.yaml") pipe = StableDiffusionPipeline.from_single_file( @@ -1285,7 +1285,7 @@ def test_download_local(self): assert image_out.shape == (512, 512, 3) def test_download_ckpt_diff_format_is_same(self): - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" sf_pipe = StableDiffusionPipeline.from_single_file(ckpt_path) sf_pipe.scheduler = DDIMScheduler.from_config(sf_pipe.scheduler.config) @@ -1310,7 +1310,7 @@ def test_download_ckpt_diff_format_is_same(self): def test_single_file_component_configs(self): pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" + ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors" single_file_pipe = StableDiffusionPipeline.from_single_file(ckpt_path, load_safety_checker=True) for param_name, param_value in single_file_pipe.text_encoder.config.to_dict().items(): From 3c85a57297b22df8921bae39c0a2e3982ee69de7 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 2 May 2024 14:03:35 +0530 Subject: [PATCH 36/56] Update CI cache (#7832) update Co-authored-by: Sayak Paul --- .github/workflows/push_tests.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 573802081704..b90ac8da2d69 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -62,7 +62,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 --privileged + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 --privileged steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -119,7 +119,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 defaults: run: shell: bash @@ -171,7 +171,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 defaults: run: shell: bash @@ -221,7 +221,7 @@ jobs: runs-on: docker-tpu container: image: diffusers/diffusers-flax-tpu - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged defaults: run: shell: bash @@ -268,7 +268,7 @@ jobs: runs-on: docker-gpu container: image: diffusers/diffusers-onnxruntime-cuda - options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 defaults: run: shell: bash @@ -317,7 +317,7 @@ jobs: 
container: image: diffusers/diffusers-pytorch-compile-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Checkout diffusers @@ -358,7 +358,7 @@ jobs: container: image: diffusers/diffusers-pytorch-xformers-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Checkout diffusers @@ -399,7 +399,7 @@ jobs: container: image: diffusers/diffusers-pytorch-cuda - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: Checkout diffusers From 44ba90caff5355f1686a7bb79cacbf0a3b27eba5 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Thu, 2 May 2024 14:53:38 +0200 Subject: [PATCH 37/56] move to new runners (#7839) --- .github/workflows/pr_test_fetcher.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr_test_fetcher.yml b/.github/workflows/pr_test_fetcher.yml index 4dbb118c6092..7bbaaf2240a2 100644 --- a/.github/workflows/pr_test_fetcher.yml +++ b/.github/workflows/pr_test_fetcher.yml @@ -15,7 +15,7 @@ concurrency: jobs: setup_pr_tests: name: Setup PR Tests - runs-on: docker-cpu + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] container: image: diffusers/diffusers-pytorch-cpu options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ @@ -73,7 +73,7 @@ jobs: max-parallel: 2 matrix: modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }} - runs-on: docker-cpu + runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ] container: image: diffusers/diffusers-pytorch-cpu options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ @@ -123,7 +123,7 @@ jobs: config: - name: Hub tests for models, schedulers, and pipelines framework: hub_tests_pytorch - runner: docker-cpu + runner: [ self-hosted, intel-cpu, 8-cpu, ci ] image: diffusers/diffusers-pytorch-cpu report: torch_hub From ce97d7e19b34b999f6bcc123a1437d283b4813aa Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Thu, 2 May 2024 15:18:46 +0200 Subject: [PATCH 38/56] Change GPU Runners (#7840) * Move to new GPU Runners for slow tests * Move to new GPU Runners for nightly tests --- .github/workflows/nightly_tests.yml | 6 +++--- .github/workflows/push_tests.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index d911dab4a306..2e9ac33d6b00 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -112,7 +112,7 @@ jobs: run_nightly_tests_for_other_torch_modules: name: Torch Non-Pipelines CUDA Nightly Tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 @@ -185,7 +185,7 @@ jobs: run_lora_nightly_tests: name: Nightly LoRA Tests with PEFT and TORCH - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0 @@ -298,7 +298,7 @@ jobs: run_nightly_onnx_tests: name: Nightly ONNXRuntime CUDA tests on Ubuntu - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: 
diffusers/diffusers-onnxruntime-cuda options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index b90ac8da2d69..ae1dd537408f 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -116,7 +116,7 @@ jobs: torch_cuda_tests: name: Torch CUDA Tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 @@ -168,7 +168,7 @@ jobs: peft_cuda_tests: name: PEFT CUDA Tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 @@ -265,7 +265,7 @@ jobs: onnx_cuda_tests: name: ONNX CUDA Tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-onnxruntime-cuda options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 @@ -313,7 +313,7 @@ jobs: run_torch_compile_tests: name: PyTorch Compile CUDA tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-compile-cuda @@ -354,7 +354,7 @@ jobs: run_xformers_tests: name: PyTorch xformers CUDA tests - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-xformers-cuda @@ -395,7 +395,7 @@ jobs: run_examples_tests: name: Examples PyTorch CUDA tests on Ubuntu - runs-on: docker-gpu + runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda From 0d7c4790235ac00b4524b492bc2a680dcc5cf6b0 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 2 May 2024 20:36:47 +0530 Subject: [PATCH 39/56] Update deps for pipe test fetcher (#7838) update Co-authored-by: Sayak Paul --- .github/workflows/push_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index ae1dd537408f..163069be0912 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -37,8 +37,8 @@ jobs: python-version: "3.8" - name: Install dependencies run: | - pip install -e . 
- pip install huggingface_hub + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] - name: Fetch Pipeline Matrix id: fetch_pipeline_matrix run: | From fa489eaed6b1812c1a1b604bb5c11ea861523f45 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Fri, 3 May 2024 07:46:48 +0530 Subject: [PATCH 40/56] [Tests] reduce the model size in the blipdiffusion fast test (#7849) reducing model size --- .../blipdiffusion/test_blipdiffusion.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/pipelines/blipdiffusion/test_blipdiffusion.py b/tests/pipelines/blipdiffusion/test_blipdiffusion.py index c5eaa3883d09..7e85cef65129 100644 --- a/tests/pipelines/blipdiffusion/test_blipdiffusion.py +++ b/tests/pipelines/blipdiffusion/test_blipdiffusion.py @@ -64,9 +64,9 @@ def get_dummy_components(self): torch.manual_seed(0) text_encoder_config = CLIPTextConfig( vocab_size=1000, - hidden_size=16, - intermediate_size=16, - projection_dim=16, + hidden_size=8, + intermediate_size=8, + projection_dim=8, num_hidden_layers=1, num_attention_heads=1, max_position_embeddings=77, @@ -78,17 +78,17 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownEncoderBlock2D",), up_block_types=("UpDecoderBlock2D",), - block_out_channels=(32,), + block_out_channels=(8,), + norm_num_groups=8, layers_per_block=1, act_fn="silu", latent_channels=4, - norm_num_groups=16, - sample_size=16, + sample_size=8, ) blip_vision_config = { - "hidden_size": 16, - "intermediate_size": 16, + "hidden_size": 8, + "intermediate_size": 8, "num_hidden_layers": 1, "num_attention_heads": 1, "image_size": 224, @@ -98,32 +98,32 @@ def get_dummy_components(self): blip_qformer_config = { "vocab_size": 1000, - "hidden_size": 16, + "hidden_size": 8, "num_hidden_layers": 1, "num_attention_heads": 1, - "intermediate_size": 16, + "intermediate_size": 8, "max_position_embeddings": 512, "cross_attention_frequency": 1, - "encoder_hidden_size": 16, + "encoder_hidden_size": 8, } qformer_config = Blip2Config( vision_config=blip_vision_config, qformer_config=blip_qformer_config, - num_query_tokens=16, + num_query_tokens=8, tokenizer="hf-internal-testing/tiny-random-bert", ) qformer = Blip2QFormerModel(qformer_config) unet = UNet2DConditionModel( - block_out_channels=(16, 32), - norm_num_groups=16, + block_out_channels=(8, 16), + norm_num_groups=8, layers_per_block=1, sample_size=16, in_channels=4, out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=16, + cross_attention_dim=8, ) tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") @@ -189,7 +189,9 @@ def test_blipdiffusion(self): assert image.shape == (1, 16, 16, 4) - expected_slice = np.array([0.7096, 0.5900, 0.6703, 0.4032, 0.7766, 0.3629, 0.5447, 0.4149, 0.8172]) + expected_slice = np.array( + [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007] + ) assert ( np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 From 6a479588db5201c1f4dfc4c01350f5c91d8d8c0d Mon Sep 17 00:00:00 2001 From: Lucain Date: Fri, 3 May 2024 08:42:57 +0200 Subject: [PATCH 41/56] Respect `resume_download` deprecation (#7843) * Deprecate resume_download * align docstring with transformers * style --------- Co-authored-by: Sayak Paul --- src/diffusers/configuration_utils.py | 8 +++---- src/diffusers/loaders/autoencoder.py | 8 +++---- 
src/diffusers/loaders/controlnet.py | 8 +++---- src/diffusers/loaders/ip_adapter.py | 8 +++---- src/diffusers/loaders/lora.py | 8 +++---- src/diffusers/loaders/single_file.py | 8 +++---- src/diffusers/loaders/single_file_utils.py | 2 +- src/diffusers/loaders/textual_inversion.py | 8 +++---- src/diffusers/loaders/unet.py | 16 ++++++------- src/diffusers/models/modeling_flax_utils.py | 8 +++---- src/diffusers/models/modeling_utils.py | 8 +++---- src/diffusers/pipelines/auto_pipeline.py | 24 +++++++++---------- .../pipelines/pipeline_flax_utils.py | 8 +++---- .../pipelines/pipeline_loading_utils.py | 4 ++-- src/diffusers/pipelines/pipeline_utils.py | 16 ++++++------- src/diffusers/schedulers/scheduling_utils.py | 6 ++--- .../schedulers/scheduling_utils_flax.py | 6 ++--- src/diffusers/utils/dynamic_modules_utils.py | 15 ++++++------ src/diffusers/utils/hub_utils.py | 2 +- 19 files changed, 86 insertions(+), 85 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 8f4a1958975d..7d76687a3d1e 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -310,9 +310,9 @@ def load_config( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -341,7 +341,7 @@ def load_config( """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py index b91d27f7d63e..36b022a26ec9 100644 --- a/src/diffusers/loaders/autoencoder.py +++ b/src/diffusers/loaders/autoencoder.py @@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -99,7 +99,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): original_config_file = kwargs.pop("original_config_file", None) config_file = kwargs.pop("config_file", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py index d323f60aa7ee..53b9802d390e 100644 --- a/src/diffusers/loaders/controlnet.py +++ b/src/diffusers/loaders/controlnet.py @@ -50,9 +50,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -89,7 +89,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): """ original_config_file = kwargs.pop("original_config_file", None) config_file = kwargs.pop("config_file", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 28a4334b199c..ef6a53e43196 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -90,9 +90,9 @@ def load_ip_adapter( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -135,7 +135,7 @@ def load_ip_adapter( # Load the main state dict first. 
cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index d69db5a83af1..2f525986a096 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -176,9 +176,9 @@ def lora_state_dict( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -208,7 +208,7 @@ def lora_state_dict( # UNet and text encoder or both. cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py index 752ef18c7a0b..d8ff92d0b0ff 100644 --- a/src/diffusers/loaders/single_file.py +++ b/src/diffusers/loaders/single_file.py @@ -177,9 +177,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -244,7 +244,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): ``` """ original_config_file = kwargs.pop("original_config_file", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 5b01b8da2b1f..c23e594c3976 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -305,7 +305,7 @@ def fetch_ldm_config_and_checkpoint( pretrained_model_link_or_path, class_name, original_config_file=None, - resume_download=False, + resume_download=None, force_download=False, proxies=None, token=None, diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index 05ed64f5dcad..a9b9a9aae052 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -38,7 +38,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs): cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -308,9 +308,9 @@ def load_textual_inversion( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/loaders/unet.py b/src/diffusers/loaders/unet.py index 3e74411865a3..5d5ed30dc35f 100644 --- a/src/diffusers/loaders/unet.py +++ b/src/diffusers/loaders/unet.py @@ -103,9 +103,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -149,7 +149,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) token = kwargs.pop("token", None) @@ -1090,9 +1090,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -1114,7 +1114,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs): raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet") config = kwargs.pop("config", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 1ddcda9005fc..151281070faa 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -245,9 +245,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -296,7 +296,7 @@ def from_pretrained( cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) from_pt = kwargs.pop("from_pt", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 8d9f2d9e71fc..a8518ca3ff7f 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -446,9 +446,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -530,7 +530,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_flax = kwargs.pop("from_flax", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) output_loading_info = kwargs.pop("output_loading_info", False) local_files_only = kwargs.pop("local_files_only", None) diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index c8b682e8afe4..5fb497ef2e22 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -234,9 +234,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -311,7 +311,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -507,9 +507,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -584,7 +584,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) @@ -783,9 +783,9 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -860,7 +860,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): """ cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index b1035c1f2f42..7534149b559a 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -254,9 +254,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. 
- resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -316,7 +316,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ``` """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", False) token = kwargs.pop("token", None) diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 15fb34e72d24..f7d9785043d1 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -435,7 +435,7 @@ def _load_empty_model( return_unused_kwargs=True, return_commit_hash=True, force_download=kwargs.pop("force_download", False), - resume_download=kwargs.pop("resume_download", False), + resume_download=kwargs.pop("resume_download", None), proxies=kwargs.pop("proxies", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("token", None), @@ -454,7 +454,7 @@ def _load_empty_model( cached_folder, subfolder=name, force_download=kwargs.pop("force_download", False), - resume_download=kwargs.pop("resume_download", False), + resume_download=kwargs.pop("resume_download", None), proxies=kwargs.pop("proxies", None), local_files_only=kwargs.pop("local_files_only", False), token=kwargs.pop("token", None), diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 59e38c910d4a..e5f822caa0ef 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -533,9 +533,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
@@ -625,7 +625,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ``` """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) @@ -1216,9 +1216,9 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. @@ -1271,7 +1271,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: """ cache_dir = kwargs.pop("cache_dir", None) - resume_download = kwargs.pop("resume_download", False) + resume_download = kwargs.pop("resume_download", None) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) local_files_only = kwargs.pop("local_files_only", None) diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py index 5dbdb82884bc..dcdce6c51f05 100644 --- a/src/diffusers/schedulers/scheduling_utils.py +++ b/src/diffusers/schedulers/scheduling_utils.py @@ -112,9 +112,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to `False`, any - incompletely downloaded files are deleted. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py index a1d471f910e5..360ca4705e02 100644 --- a/src/diffusers/schedulers/scheduling_utils_flax.py +++ b/src/diffusers/schedulers/scheduling_utils_flax.py @@ -102,9 +102,9 @@ def from_pretrained( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. 
proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index add95812122c..8df418f3fdd4 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -201,7 +201,7 @@ def get_cached_module_file( module_file: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -228,9 +228,9 @@ def get_cached_module_file( cache should not be used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they - exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + exist. resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 + of Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. @@ -380,7 +380,7 @@ def get_class_from_dynamic_module( class_name: Optional[str] = None, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, - resume_download: bool = False, + resume_download: Optional[bool] = None, proxies: Optional[Dict[str, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, @@ -417,8 +417,9 @@ def get_class_from_dynamic_module( force_download (`bool`, *optional*, defaults to `False`): Whether or not to force to (re-)download the configuration files and override the cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. Will be removed in v1 of + Diffusers. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
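The same mechanical change repeats across every loader above: the `resume_download` default moves from `False` to `None`, and the docstrings now mark the flag as deprecated and ignored because downloads are resumed by default whenever possible. For callers the practical effect is that the flag can simply be dropped; passing it still works, it just no longer changes behavior. A minimal caller-side sketch (the checkpoint name is only an illustration):

```py
from diffusers import DiffusionPipeline

# Previously, resumable downloads were often requested explicitly:
# pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", resume_download=True)

# After this patch the argument is ignored by Diffusers and only forwarded downstream,
# presumably so huggingface_hub can emit its own deprecation warning, so the call reduces to:
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
```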
diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index d70ee53aaa41..83f02848fcf4 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -283,7 +283,7 @@ def _get_model_file( cache_dir: Optional[str] = None, force_download: bool = False, proxies: Optional[Dict] = None, - resume_download: bool = False, + resume_download: Optional[bool] = None, local_files_only: bool = False, token: Optional[str] = None, user_agent: Optional[Union[Dict, str]] = None, From 3e35628873e5a1723fdbb84a8789e99f243b4858 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Fri, 3 May 2024 15:09:15 +0530 Subject: [PATCH 42/56] Remove installing python again in container (#7852) update --- .github/workflows/push_tests.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 163069be0912..00491e54b738 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -31,14 +31,13 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 2 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.8" - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] + - name: Environment + run: | + python utils/print_env.py - name: Fetch Pipeline Matrix id: fetch_pipeline_matrix run: | From 58237364b1780223f48a80256f56408efe7b59a0 Mon Sep 17 00:00:00 2001 From: HelloWorldBeginner <294810125@qq.com> Date: Sat, 4 May 2024 02:14:34 +0800 Subject: [PATCH 43/56] Add Ascend NPU support for SDXL fine-tuning and fix the model saving bug when using DeepSpeed. (#7816) * Add Ascend NPU support for SDXL fine-tuning and fix the model saving bug when using DeepSpeed. * fix check code quality * Decouple the NPU flash attention and make it an independent module. * add doc and unit tests for npu flash attention. 
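Two user-facing behaviours fall out of this commit; both are sketched here rather than copied from the diff. First, any model that exposes the new `enable_npu_flash_attention` hook can opt into the fused `torch_npu` attention path. The snippet assumes an Ascend NPU environment with `torch_npu` installed, since `AttnProcessorNPU` refuses to construct without it:

```py
import torch
from diffusers import StableDiffusionXLPipeline

# Assumes torch_npu is installed and an Ascend NPU device is visible.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("npu")

# Route the UNet's attention through the torch_npu fused kernel
# (fp16/bf16 only; fp32 falls back to scaled_dot_product_attention inside the processor).
pipe.unet.enable_npu_flash_attention()

image = pipe("Self-portrait oil painting, a beautiful cyborg with golden hair, 8k").images[0]
```

Second, the checkpointing guard added to both training scripts: DeepSpeed requires every rank to take part in saving, so gating on `is_main_process` alone causes issues. A self-contained sketch of the same guard (the helper name is illustrative, not taken from the scripts):

```py
import os

from accelerate import Accelerator
from accelerate.utils import DistributedType


def save_checkpoint(accelerator: Accelerator, output_dir: str, global_step: int) -> None:
    # Under DeepSpeed every process must call save_state; on other setups only the
    # main process should write to disk.
    if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process:
        accelerator.save_state(os.path.join(output_dir, f"checkpoint-{global_step}"))
```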
--------- Co-authored-by: mhh001 Co-authored-by: Sayak Paul --- docs/source/en/api/attnprocessor.md | 3 + examples/controlnet/train_controlnet_sdxl.py | 19 ++- .../train_text_to_image_lora_sdxl.py | 19 ++- src/diffusers/models/activations.py | 14 +- src/diffusers/models/attention_processor.py | 132 +++++++++++++++++- src/diffusers/models/modeling_utils.py | 30 ++++ tests/models/test_modeling_common.py | 56 +++++++- 7 files changed, 261 insertions(+), 12 deletions(-) diff --git a/docs/source/en/api/attnprocessor.md b/docs/source/en/api/attnprocessor.md index ab89d4d260f0..f586e9b08f2c 100644 --- a/docs/source/en/api/attnprocessor.md +++ b/docs/source/en/api/attnprocessor.md @@ -55,3 +55,6 @@ An attention processor is a class for applying different types of attention mech ## XFormersAttnProcessor [[autodoc]] models.attention_processor.XFormersAttnProcessor + +## AttnProcessorNPU +[[autodoc]] models.attention_processor.AttnProcessorNPU diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py index 62192521a323..288a1e3fb612 100644 --- a/examples/controlnet/train_controlnet_sdxl.py +++ b/examples/controlnet/train_controlnet_sdxl.py @@ -32,7 +32,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import ProjectConfiguration, set_seed +from accelerate.utils import DistributedType, ProjectConfiguration, set_seed from datasets import load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version @@ -53,7 +53,7 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available, make_image_grid from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card -from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -64,6 +64,8 @@ check_min_version("0.28.0.dev0") logger = get_logger(__name__) +if is_torch_npu_available(): + torch.npu.config.allow_internal_format = False def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step, is_final_validation=False): @@ -471,6 +473,9 @@ def parse_args(input_args=None): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument( + "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention." + ) parser.add_argument( "--set_grads_to_none", action="store_true", @@ -936,6 +941,13 @@ def load_model_hook(models, input_dir): text_encoder_two.requires_grad_(False) controlnet.train() + if args.enable_npu_flash_attention: + if is_torch_npu_available(): + logger.info("npu flash attention enabled.") + unet.enable_npu_flash_attention() + else: + raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.") + if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers @@ -1235,7 +1247,8 @@ def compute_embeddings(batch, proportion_empty_prompts, text_encoders, tokenizer progress_bar.update(1) global_step += 1 - if accelerator.is_main_process: + # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues. 
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` if args.checkpoints_total_limit is not None: diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 0a6a70de2dc7..3604e755c62a 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -32,7 +32,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed +from accelerate.utils import DistributedDataParallelKwargs, DistributedType, ProjectConfiguration, set_seed from datasets import load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version @@ -60,7 +60,7 @@ is_wandb_available, ) from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card -from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -68,6 +68,8 @@ check_min_version("0.28.0.dev0") logger = get_logger(__name__) +if is_torch_npu_available(): + torch.npu.config.allow_internal_format = False def save_model_card( @@ -419,6 +421,9 @@ def parse_args(input_args=None): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument( + "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention." + ) parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") parser.add_argument( "--rank", @@ -623,6 +628,13 @@ def main(args): text_encoder_one.to(accelerator.device, dtype=weight_dtype) text_encoder_two.to(accelerator.device, dtype=weight_dtype) + if args.enable_npu_flash_attention: + if is_torch_npu_available(): + logger.info("npu flash attention enabled.") + unet.enable_npu_flash_attention() + else: + raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.") + if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers @@ -1149,7 +1161,8 @@ def compute_time_ids(original_size, crops_coords_top_left): accelerator.log({"train_loss": train_loss}, step=global_step) train_loss = 0.0 - if accelerator.is_main_process: + # DeepSpeed requires saving weights on every device; saving weights only on the main process would cause issues. 
+ if accelerator.distributed_type == DistributedType.DEEPSPEED or accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` if args.checkpoints_total_limit is not None: diff --git a/src/diffusers/models/activations.py b/src/diffusers/models/activations.py index cec83bdded9e..f94b6c8d6d06 100644 --- a/src/diffusers/models/activations.py +++ b/src/diffusers/models/activations.py @@ -18,8 +18,12 @@ from torch import nn from ..utils import deprecate +from ..utils.import_utils import is_torch_npu_available +if is_torch_npu_available(): + import torch_npu + ACTIVATION_FUNCTIONS = { "swish": nn.SiLU(), "silu": nn.SiLU(), @@ -98,9 +102,13 @@ def forward(self, hidden_states, *args, **kwargs): if len(args) > 0 or kwargs.get("scale", None) is not None: deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." deprecate("scale", "1.0.0", deprecation_message) - - hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1) - return hidden_states * self.gelu(gate) + hidden_states = self.proj(hidden_states) + if is_torch_npu_available(): + # using torch_npu.npu_geglu can run faster and save memory on NPU. + return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0] + else: + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.gelu(gate) class ApproximateGELU(nn.Module): diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 429807989296..ea1c987e95c6 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import inspect +import math from importlib import import_module from typing import Callable, List, Optional, Union @@ -21,13 +22,15 @@ from ..image_processor import IPAdapterMaskProcessor from ..utils import deprecate, logging -from ..utils.import_utils import is_xformers_available +from ..utils.import_utils import is_torch_npu_available, is_xformers_available from ..utils.torch_utils import maybe_allow_in_graph from .lora import LoRALinearLayer logger = logging.get_logger(__name__) # pylint: disable=invalid-name +if is_torch_npu_available(): + import torch_npu if is_xformers_available(): import xformers @@ -209,6 +212,23 @@ def __init__( ) self.set_processor(processor) + def set_use_npu_flash_attention(self, use_npu_flash_attention: bool) -> None: + r""" + Set whether to use npu flash attention from `torch_npu` or not. + + """ + if use_npu_flash_attention: + processor = AttnProcessorNPU() + else: + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. 
TODO remove scale_qk check when we move to torch 2.1 + processor = ( + AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() + ) + self.set_processor(processor) + def set_use_memory_efficient_attention_xformers( self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None ) -> None: @@ -1207,6 +1227,116 @@ def __call__( return hidden_states +class AttnProcessorNPU: + + r""" + Processor for implementing flash attention using torch_npu. Torch_npu supports only fp16 and bf16 data types. If + fp32 is used, F.scaled_dot_product_attention will be used for computation, but the acceleration effect on NPU is + not significant. + + """ + + def __init__(self): + if not is_torch_npu_available(): + raise ImportError("AttnProcessorNPU requires torch_npu extensions and is supported only on npu devices.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + if query.dtype in (torch.float16, torch.bfloat16): + hidden_states = torch_npu.npu_fusion_attention( + query, + key, + value, + attn.heads, + input_layout="BNSD", + pse=None, + atten_mask=attention_mask, + scale=1.0 / math.sqrt(query.shape[-1]), + pre_tockens=65536, + next_tockens=65536, + keep_prob=1.0, + sync=False, + inner_precise=0, + )[0] + else: + # TODO: add support for attn.scale when we move to Torch 2.1 + 
hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + class AttnProcessor2_0: r""" Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index a8518ca3ff7f..373f5453aa23 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -272,6 +272,36 @@ def disable_gradient_checkpointing(self) -> None: if self._supports_gradient_checkpointing: self.apply(partial(self._set_gradient_checkpointing, value=False)) + def set_use_npu_flash_attention(self, valid: bool) -> None: + r""" + Set the switch for the npu flash attention. + """ + + def fn_recursive_set_npu_flash_attention(module: torch.nn.Module): + if hasattr(module, "set_use_npu_flash_attention"): + module.set_use_npu_flash_attention(valid) + + for child in module.children(): + fn_recursive_set_npu_flash_attention(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_npu_flash_attention(module) + + def enable_npu_flash_attention(self) -> None: + r""" + Enable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(True) + + def disable_npu_flash_attention(self) -> None: + r""" + disable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(False) + def set_use_memory_efficient_attention_xformers( self, valid: bool, attention_op: Optional[Callable] = None ) -> None: diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index d9e70c6dd784..59369b509876 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -30,9 +30,14 @@ from requests.exceptions import HTTPError from diffusers.models import UNet2DConditionModel -from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor +from diffusers.models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + AttnProcessorNPU, + XFormersAttnProcessor, +) from diffusers.training_utils import EMAModel -from diffusers.utils import is_xformers_available, logging +from diffusers.utils import is_torch_npu_available, is_xformers_available, logging from diffusers.utils.testing_utils import ( CaptureLogger, get_python_version, @@ -300,6 +305,53 @@ def test_getattr_is_correct(self): assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" + @unittest.skipIf( + torch_device != "npu" or not is_torch_npu_available(), + reason="torch npu flash attention is only available with NPU and `torch_npu` installed", + ) + def test_set_torch_npu_flash_attn_processor_determinism(self): + torch.use_deterministic_algorithms(False) + if self.forward_requires_fresh_args: + model = self.model_class(**self.init_dict) + else: + init_dict, inputs_dict = 
self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + + if not hasattr(model, "set_attn_processor"): + # If not has `set_attn_processor`, skip test + return + + model.set_default_attn_processor() + assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values()) + with torch.no_grad(): + if self.forward_requires_fresh_args: + output = model(**self.inputs_dict(0))[0] + else: + output = model(**inputs_dict)[0] + + model.enable_npu_flash_attention() + assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values()) + with torch.no_grad(): + if self.forward_requires_fresh_args: + output_2 = model(**self.inputs_dict(0))[0] + else: + output_2 = model(**inputs_dict)[0] + + model.set_attn_processor(AttnProcessorNPU()) + assert all(type(proc) == AttnProcessorNPU for proc in model.attn_processors.values()) + with torch.no_grad(): + if self.forward_requires_fresh_args: + output_3 = model(**self.inputs_dict(0))[0] + else: + output_3 = model(**inputs_dict)[0] + + torch.use_deterministic_algorithms(True) + + assert torch.allclose(output, output_2, atol=self.base_precision) + assert torch.allclose(output, output_3, atol=self.base_precision) + assert torch.allclose(output_2, output_3, atol=self.base_precision) + @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", From 49b959b5408b97274e2ee423059d9239445aea26 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 3 May 2024 16:08:27 -0700 Subject: [PATCH 44/56] [docs] LCM (#7829) * lcm * lcm lora * fix * fix hfoption * edits --- docs/source/en/_toctree.yml | 6 +- .../en/using-diffusers/inference_with_lcm.md | 463 ++++++++++++++++-- .../inference_with_lcm_lora.md | 422 ---------------- 3 files changed, 412 insertions(+), 479 deletions(-) delete mode 100644 docs/source/en/using-diffusers/inference_with_lcm_lora.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f2755798b792..89af55ed2a5b 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -81,16 +81,14 @@ title: ControlNet - local: using-diffusers/t2i_adapter title: T2I-Adapter + - local: using-diffusers/inference_with_lcm + title: Latent Consistency Model - local: using-diffusers/textual_inversion_inference title: Textual inversion - local: using-diffusers/shap-e title: Shap-E - local: using-diffusers/diffedit title: DiffEdit - - local: using-diffusers/inference_with_lcm_lora - title: Latent Consistency Model-LoRA - - local: using-diffusers/inference_with_lcm - title: Latent Consistency Model - local: using-diffusers/inference_with_tcd_lora title: Trajectory Consistency Distillation-LoRA - local: using-diffusers/svd diff --git a/docs/source/en/using-diffusers/inference_with_lcm.md b/docs/source/en/using-diffusers/inference_with_lcm.md index 798de67c6560..19fb349c5458 100644 --- a/docs/source/en/using-diffusers/inference_with_lcm.md +++ b/docs/source/en/using-diffusers/inference_with_lcm.md @@ -10,29 +10,30 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. 
--> +# Latent Consistency Model + [[open-in-colab]] -# Latent Consistency Model +[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) enable fast high-quality image generation by directly predicting the reverse diffusion process in the latent rather than pixel space. In other words, LCMs try to predict the noiseless image from the noisy image in contrast to typical diffusion models that iteratively remove noise from the noisy image. By avoiding the iterative sampling process, LCMs are able to generate high-quality images in 2-4 steps instead of 20-30 steps. -Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. +LCMs are distilled from pretrained models which requires ~32 hours of A100 compute. To speed this up, [LCM-LoRAs](https://hf.co/papers/2311.05556) train a [LoRA adapter](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) which have much fewer parameters to train compared to the full model. The LCM-LoRA can be plugged into a diffusion model once it has been trained. -From the [official website](https://latent-consistency-models.github.io/): +This guide will show you how to use LCMs and LCM-LoRAs for fast inference on tasks and how to use them with other adapters like ControlNet or T2I-Adapter. -> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations. +> [!TIP] +> LCMs and LCM-LoRAs are available for Stable Diffusion v1.5, Stable Diffusion XL, and the SSD-1B model. You can find their checkpoints on the [Latent Consistency](https://hf.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8) Collections. -For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). +## Text-to-image -LCM distilled models are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-weights-654ce61a95edd6dffccef6a8). + + -This guide shows how to perform inference with LCMs for -- text-to-image -- image-to-image -- combined with style LoRAs -- ControlNet/T2I-Adapter +To use LCMs, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps. -## Text-to-image +A couple of notes to keep in mind when using LCMs are: -You'll use the [`StableDiffusionXLPipeline`] pipeline with the [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow, overcoming the slow iterative nature of diffusion models. +* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. 
The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process. +* The ideal range for `guidance_scale` is [3., 13.] because that is what the UNet was trained with. However, disabling `guidance_scale` with a value of 1.0 is also effective in most cases. ```python from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler @@ -49,31 +50,69 @@ pipe = StableDiffusionXLPipeline.from_pretrained( pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" - generator = torch.manual_seed(0) image = pipe( prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0 ).images[0] +image ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2i.png) +
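The note about disabling guidance is easy to miss because the example above uses `guidance_scale=8.0`. Reusing the `pipe` built in that snippet, turning guidance off is just a matter of passing `guidance_scale=1.0`; a small follow-on sketch:

```py
# Continues the snippet above: `pipe` is the SDXL pipeline with the LCM UNet and LCMScheduler.
# guidance_scale=1.0 effectively disables guidance, which the notes above say is often enough.
generator = torch.manual_seed(0)
image = pipe(
    prompt="Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
    num_inference_steps=4,
    generator=generator,
    guidance_scale=1.0,
).images[0]
```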
-Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL. +
+ + +To use LCM-LoRAs, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt to generate an image in just 4 steps. + +A couple of notes to keep in mind when using LCM-LoRAs are: + +* Typically, batch size is doubled inside the pipeline for classifier-free guidance. But LCM applies guidance with guidance embeddings and doesn't need to double the batch size, which leads to faster inference. The downside is that negative prompts don't work with LCM because they don't have any effect on the denoising process. +* You could use guidance with LCM-LoRAs, but it is very sensitive to high `guidance_scale` values and can lead to artifacts in the generated image. The best values we've found are between [1.0, 2.0]. +* Replace [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) with any finetuned model. For example, try using the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) checkpoint to generate anime images with SDXL. + +```py +import torch +from diffusers import DiffusionPipeline, LCMScheduler + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + variant="fp16", + torch_dtype=torch.float16 +).to("cuda") +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") -Some details to keep in mind: +prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" +generator = torch.manual_seed(42) +image = pipe( + prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 +).images[0] +image +``` -* To perform classifier-free guidance, batch size is usually doubled inside the pipeline. LCM, however, applies guidance using guidance embeddings, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process. -* The UNet was trained using the [3., 13.] guidance scale range. So, that is the ideal range for `guidance_scale`. However, disabling `guidance_scale` using a value of 1.0 is also effective in most cases. +
+ +
+
+
## Image-to-image -LCMs can be applied to image-to-image tasks too. For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model, but the same steps can be applied to other LCM models as well. + + + +To use LCMs for image-to-image, you need to load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps. + +> [!TIP] +> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results. ```python import torch from diffusers import AutoPipelineForImage2Image, UNet2DConditionModel, LCMScheduler -from diffusers.utils import make_image_grid, load_image +from diffusers.utils import load_image unet = UNet2DConditionModel.from_pretrained( "SimianLuo/LCM_Dreamshaper_v7", @@ -89,12 +128,8 @@ pipe = AutoPipelineForImage2Image.from_pretrained( ).to("cuda") pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) -# prepare image -url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png" -init_image = load_image(url) +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png") prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" - -# pass prompt and image to pipeline generator = torch.manual_seed(0) image = pipe( prompt, @@ -104,22 +139,130 @@ image = pipe( strength=0.5, generator=generator ).images[0] -make_image_grid([init_image, image], rows=1, cols=2) +image ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_i2i.png) +
+
+ +
initial image
+
+
+ +
generated image
+
+
+ +
+ +To use LCM-LoRAs for image-to-image, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt and initial image to generate an image in just 4 steps. - +> [!TIP] +> Experiment with different values for `num_inference_steps`, `strength`, and `guidance_scale` to get the best results. -You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one. +```py +import torch +from diffusers import AutoPipelineForImage2Image, LCMScheduler +from diffusers.utils import make_image_grid, load_image + +pipe = AutoPipelineForImage2Image.from_pretrained( + "Lykon/dreamshaper-7", + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png") +prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" + +generator = torch.manual_seed(0) +image = pipe( + prompt, + image=init_image, + num_inference_steps=4, + guidance_scale=1, + strength=0.6, + generator=generator +).images[0] +image +``` - +
+
+ +
initial image
+
+
+ +
generated image
+
+
+
+
-## Combine with style LoRAs +## Inpainting -LCMs can be used with other styled LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the [papercut LoRA](TheLastBen/Papercut_SDXL). +To use LCM-LoRAs for inpainting, you need to replace the scheduler with the [`LCMScheduler`] and load the LCM-LoRA weights with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method. Then you can use the pipeline as usual, and pass a text prompt, initial image, and mask image to generate an image in just 4 steps. + +```py +import torch +from diffusers import AutoPipelineForInpainting, LCMScheduler +from diffusers.utils import load_image, make_image_grid + +pipe = AutoPipelineForInpainting.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") +mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png") + +prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, + image=init_image, + mask_image=mask_image, + generator=generator, + num_inference_steps=4, + guidance_scale=4, +).images[0] +image +``` + +
+
+ +
initial image
+
+
+ +
generated image
+
+
+ +## Adapters + +LCMs are compatible with adapters like LoRA, ControlNet, T2I-Adapter, and AnimateDiff. You can bring the speed of LCMs to these adapters to generate images in a certain style or condition the model on another input like a canny image. + +### LoRA + +[LoRA](../using-diffusers/loading_adapters#lora) adapters can be rapidly finetuned to learn a new style from just a few images and plugged into a pretrained model to generate images in that style. + + + + +Load the LCM checkpoint for your supported model into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LoRA weights into the LCM and generate a styled image in a few steps. ```python from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, LCMScheduler @@ -134,11 +277,9 @@ pipe = StableDiffusionXLPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16, variant="fp16", ).to("cuda") pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") prompt = "papercut, a cute fox" - generator = torch.manual_seed(0) image = pipe( prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=8.0 @@ -146,15 +287,58 @@ image = pipe( image ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdx_lora_mix.png) +
+ +
+
+ -## ControlNet/T2I-Adapter +Replace the scheduler with the [`LCMScheduler`]. Then you can use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights and the style LoRA you want to use. Combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method and generate a styled image in a few steps. -Let's look at how we can perform inference with ControlNet/T2I-Adapter and a LCM. +```py +import torch +from diffusers import DiffusionPipeline, LCMScheduler + +pipe = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + variant="fp16", + torch_dtype=torch.float16 +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm") +pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") + +pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8]) + +prompt = "papercut, a cute fox" +generator = torch.manual_seed(0) +image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0] +image +``` + +
+ +
+ +
+
### ControlNet -For this example, we'll use the [LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model with canny ControlNet, but the same steps can be applied to other LCM models as well. + +[ControlNet](./controlnet) are adapters that can be trained on a variety of inputs like canny edge, pose estimation, or depth. The ControlNet can be inserted into the pipeline to provide additional conditioning and control to the model for more accurate generation. + +You can find additional ControlNet models trained on other inputs in [lllyasviel's](https://hf.co/lllyasviel) repository. + + + + +Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a LCM model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image. + +> [!TIP] +> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results. ```python import torch @@ -186,8 +370,6 @@ pipe = StableDiffusionControlNetPipeline.from_pretrained( torch_dtype=torch.float16, safety_checker=None, ).to("cuda") - -# set scheduler pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) generator = torch.manual_seed(0) @@ -200,16 +382,84 @@ image = pipe( make_image_grid([canny_image, image], rows=1, cols=2) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdv1-5_controlnet.png) +
+ +
+
+ - -The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the best one. - +Load a ControlNet model trained on canny images and pass it to the [`ControlNetModel`]. Then you can load a Stable Diffusion v1.5 model into [`StableDiffusionControlNetPipeline`] and replace the scheduler with the [`LCMScheduler`]. Use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights, and pass the canny image to the pipeline and generate an image. + +> [!TIP] +> Experiment with different values for `num_inference_steps`, `controlnet_conditioning_scale`, `cross_attention_kwargs`, and `guidance_scale` to get the best results. + +```py +import torch +import cv2 +import numpy as np +from PIL import Image + +from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler +from diffusers.utils import load_image + +image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" +).resize((512, 512)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + controlnet=controlnet, + torch_dtype=torch.float16, + safety_checker=None, + variant="fp16" +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") + +generator = torch.manual_seed(0) +image = pipe( + "the mona lisa", + image=canny_image, + num_inference_steps=4, + guidance_scale=1.5, + controlnet_conditioning_scale=0.8, + cross_attention_kwargs={"scale": 1}, + generator=generator, +).images[0] +image +``` + +
+ +
+ +
+
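To make the tips above concrete, here is a minimal sweep sketch, assuming the `pipe` and `canny_image` objects from the LCM-LoRA snippet above are already defined. The parameter grid is illustrative rather than a recommendation.

```py
import torch
from diffusers.utils import make_image_grid

# assumes `pipe` (StableDiffusionControlNetPipeline with LCMScheduler + LCM-LoRA)
# and `canny_image` were created as in the snippet above
guidance_scales = [1.0, 1.5, 2.0]        # LCM-LoRA is usually run with low guidance
conditioning_scales = [0.5, 0.8, 1.0]    # how strongly the canny map constrains the output

images = []
for guidance_scale in guidance_scales:
    for conditioning_scale in conditioning_scales:
        generator = torch.manual_seed(0)  # fixed seed so only the parameters change
        image = pipe(
            "the mona lisa",
            image=canny_image,
            num_inference_steps=4,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=conditioning_scale,
            cross_attention_kwargs={"scale": 1},
            generator=generator,
        ).images[0]
        images.append(image)

make_image_grid(images, rows=len(guidance_scales), cols=len(conditioning_scales))
```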
### T2I-Adapter -This example shows how to use the `lcm-sdxl` with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0). +[T2I-Adapter](./t2i_adapter) is an even more lightweight adapter than ControlNet, that provides an additional input to condition a pretrained model with. It is faster than ControlNet but the results may be slightly worse. + +You can find additional T2I-Adapter checkpoints trained on other inputs in [TencentArc's](https://hf.co/TencentARC) repository. + + + + +Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Then load a LCM checkpoint into [`UNet2DConditionModel`] and replace the scheduler with the [`LCMScheduler`]. Now pass the canny image to the pipeline and generate an image. ```python import torch @@ -220,10 +470,9 @@ from PIL import Image from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler from diffusers.utils import load_image, make_image_grid -# Prepare image -# Detect the canny map in low resolution to avoid high-frequency details +# detect the canny map in low resolution to avoid high-frequency details image = load_image( - "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" ).resize((384, 384)) image = np.array(image) @@ -236,7 +485,6 @@ image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) canny_image = Image.fromarray(image).resize((1024, 1216)) -# load adapter adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") unet = UNet2DConditionModel.from_pretrained( @@ -254,7 +502,7 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained( pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) -prompt = "Mystical fairy in real, magic, 4k picture, high quality" +prompt = "the mona lisa, 4k picture, high quality" negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" generator = torch.manual_seed(0) @@ -268,7 +516,116 @@ image = pipe( adapter_conditioning_factor=1, generator=generator, ).images[0] -grid = make_image_grid([canny_image, image], rows=1, cols=2) ``` -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_full_sdxl_t2iadapter.png) +
+ +
+ +
+ + +Load a T2IAdapter trained on canny images and pass it to the [`StableDiffusionXLAdapterPipeline`]. Replace the scheduler with the [`LCMScheduler`], and use the [`~loaders.LoraLoaderMixin.load_lora_weights`] method to load the LCM-LoRA weights. Pass the canny image to the pipeline and generate an image. + +```py +import torch +import cv2 +import numpy as np +from PIL import Image + +from diffusers import StableDiffusionXLAdapterPipeline, UNet2DConditionModel, T2IAdapter, LCMScheduler +from diffusers.utils import load_image, make_image_grid + +# detect the canny map in low resolution to avoid high-frequency details +image = load_image( + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" +).resize((384, 384)) + +image = np.array(image) + +low_threshold = 100 +high_threshold = 200 + +image = cv2.Canny(image, low_threshold, high_threshold) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image).resize((1024, 1024)) + +adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") + +pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + adapter=adapter, + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") + +prompt = "the mona lisa, 4k picture, high quality" +negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" + +generator = torch.manual_seed(0) +image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=canny_image, + num_inference_steps=4, + guidance_scale=1.5, + adapter_conditioning_scale=0.8, + adapter_conditioning_factor=1, + generator=generator, +).images[0] +``` + +
+ +
+ +
+
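The canny preprocessing is repeated in several snippets in this guide, so it can be handy to factor it into a small helper. This is a minimal sketch, assuming `opencv-python` and Pillow are installed; `make_canny_condition` is an illustrative name and the threshold defaults mirror the values used above.

```py
import cv2
import numpy as np
from PIL import Image

from diffusers.utils import load_image

def make_canny_condition(image, low_threshold=100, high_threshold=200):
    """Turn an RGB image into the 3-channel canny edge map expected by the examples above."""
    edges = cv2.Canny(np.array(image), low_threshold, high_threshold)
    edges = edges[:, :, None]                              # HxW -> HxWx1
    edges = np.concatenate([edges, edges, edges], axis=2)  # replicate to 3 channels
    return Image.fromarray(edges)

image = load_image(
    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
).resize((384, 384))
canny_image = make_canny_condition(image).resize((1024, 1024))
```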
+ +### AnimateDiff + +[AnimateDiff](../api/pipelines/animatediff) is an adapter that adds motion to an image. It can be used with most Stable Diffusion models, effectively turning them into "video generation" models. Generating good results with a video model usually requires generating multiple frames (16-24), which can be very slow with a regular Stable Diffusion model. LCM-LoRA can speed up this process by only taking 4-8 steps for each frame. + +Load a [`AnimateDiffPipeline`] and pass a [`MotionAdapter`] to it. Then replace the scheduler with the [`LCMScheduler`], and combine both LoRA adapters with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. Now you can pass a prompt to the pipeline and generate an animated image. + +```py +import torch +from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler +from diffusers.utils import export_to_gif + +adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5") +pipe = AnimateDiffPipeline.from_pretrained( + "frankjoshua/toonyou_beta6", + motion_adapter=adapter, +).to("cuda") + +# set scheduler +pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) + +# load LCM-LoRA +pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm") +pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora") + +pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2]) + +prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" +generator = torch.manual_seed(0) +frames = pipe( + prompt=prompt, + num_inference_steps=5, + guidance_scale=1.25, + cross_attention_kwargs={"scale": 1}, + num_frames=24, + generator=generator +).frames[0] +export_to_gif(frames, "animation.gif") +``` + +
+ +
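If an MP4 is more convenient than a GIF, the same frames can be written with the `export_to_video` utility instead. This is a small variation on the last line of the snippet above and assumes `frames` has already been generated.

```py
from diffusers.utils import export_to_video

# reuse the `frames` list produced by the AnimateDiff snippet above
export_to_video(frames, "animation.mp4")
```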
diff --git a/docs/source/en/using-diffusers/inference_with_lcm_lora.md b/docs/source/en/using-diffusers/inference_with_lcm_lora.md deleted file mode 100644 index 36120a04828c..000000000000 --- a/docs/source/en/using-diffusers/inference_with_lcm_lora.md +++ /dev/null @@ -1,422 +0,0 @@ - - -[[open-in-colab]] - -# Performing inference with LCM-LoRA - -Latent Consistency Models (LCM) enable quality image generation in typically 2-4 steps making it possible to use diffusion models in almost real-time settings. - -From the [official website](https://latent-consistency-models.github.io/): - -> LCMs can be distilled from any pre-trained Stable Diffusion (SD) in only 4,000 training steps (~32 A100 GPU Hours) for generating high quality 768 x 768 resolution images in 2~4 steps or even one step, significantly accelerating text-to-image generation. We employ LCM to distill the Dreamshaper-V7 version of SD in just 4,000 training iterations. - -For a more technical overview of LCMs, refer to [the paper](https://huggingface.co/papers/2310.04378). - -However, each model needs to be distilled separately for latent consistency distillation. The core idea with LCM-LoRA is to train just a few adapter layers, the adapter being LoRA in this case. -This way, we don't have to train the full model and keep the number of trainable parameters manageable. The resulting LoRAs can then be applied to any fine-tuned version of the model without distilling them separately. -Additionally, the LoRAs can be applied to image-to-image, ControlNet/T2I-Adapter, inpainting, AnimateDiff etc. -The LCM-LoRA can also be combined with other LoRAs to generate styled images in very few steps (4-8). - -LCM-LoRAs are available for [stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and the [SSD-1B](https://huggingface.co/segmind/SSD-1B) model. All the checkpoints can be found in this [collection](https://huggingface.co/collections/latent-consistency/latent-consistency-models-loras-654cdd24e111e16f0865fba6). - -For more details about LCM-LoRA, refer to [the technical report](https://huggingface.co/papers/2311.05556). - -This guide shows how to perform inference with LCM-LoRAs for -- text-to-image -- image-to-image -- combined with styled LoRAs -- ControlNet/T2I-Adapter -- inpainting -- AnimateDiff - -Before going through this guide, we'll take a look at the general workflow for performing inference with LCM-LoRAs. -LCM-LoRAs are similar to other Stable Diffusion LoRAs so they can be used with any [`DiffusionPipeline`] that supports LoRAs. - -- Load the task specific pipeline and model. -- Set the scheduler to [`LCMScheduler`]. -- Load the LCM-LoRA weights for the model. -- Reduce the `guidance_scale` between `[1.0, 2.0]` and set the `num_inference_steps` between [4, 8]. -- Perform inference with the pipeline with the usual parameters. - -Let's look at how we can perform inference with LCM-LoRAs for different tasks. - -First, make sure you have [peft](https://github.com/huggingface/peft) installed, for better LoRA support. - -```bash -pip install -U peft -``` - -## Text-to-image - -You'll use the [`StableDiffusionXLPipeline`] with the scheduler: [`LCMScheduler`] and then load the LCM-LoRA. Together with the LCM-LoRA and the scheduler, the pipeline enables a fast inference workflow overcoming the slow iterative nature of diffusion models. 
- -```python -import torch -from diffusers import DiffusionPipeline, LCMScheduler - -pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - variant="fp16", - torch_dtype=torch.float16 -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") - -prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" - -generator = torch.manual_seed(42) -image = pipe( - prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 -).images[0] -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i.png) - -Notice that we use only 4 steps for generation which is way less than what's typically used for standard SDXL. - - - -You may have noticed that we set `guidance_scale=1.0`, which disables classifer-free-guidance. This is because the LCM-LoRA is trained with guidance, so the batch size does not have to be doubled in this case. This leads to a faster inference time, with the drawback that negative prompts don't have any effect on the denoising process. - -You can also use guidance with LCM-LoRA, but due to the nature of training the model is very sensitve to the `guidance_scale` values, high values can lead to artifacts in the generated images. In our experiments, we found that the best values are in the range of [1.0, 2.0]. - - - -### Inference with a fine-tuned model - -As mentioned above, the LCM-LoRA can be applied to any fine-tuned version of the model without having to distill them separately. Let's look at how we can perform inference with a fine-tuned model. In this example, we'll use the [animagine-xl](https://huggingface.co/Linaqruf/animagine-xl) model, which is a fine-tuned version of the SDXL model for generating anime. - -```python -from diffusers import DiffusionPipeline, LCMScheduler - -pipe = DiffusionPipeline.from_pretrained( - "Linaqruf/animagine-xl", - variant="fp16", - torch_dtype=torch.float16 -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") - -prompt = "face focus, cute, masterpiece, best quality, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck" - -generator = torch.manual_seed(0) -image = pipe( - prompt=prompt, num_inference_steps=4, generator=generator, guidance_scale=1.0 -).images[0] -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2i_finetuned.png) - - -## Image-to-image - -LCM-LoRA can be applied to image-to-image tasks too. Let's look at how we can perform image-to-image generation with LCMs. For this example we'll use the [dreamshaper-7](https://huggingface.co/Lykon/dreamshaper-7) model and the LCM-LoRA for `stable-diffusion-v1-5 `. 
- -```python -import torch -from diffusers import AutoPipelineForImage2Image, LCMScheduler -from diffusers.utils import make_image_grid, load_image - -pipe = AutoPipelineForImage2Image.from_pretrained( - "Lykon/dreamshaper-7", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") - -# prepare image -url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png" -init_image = load_image(url) -prompt = "Astronauts in a jungle, cold color palette, muted colors, detailed, 8k" - -# pass prompt and image to pipeline -generator = torch.manual_seed(0) -image = pipe( - prompt, - image=init_image, - num_inference_steps=4, - guidance_scale=1, - strength=0.6, - generator=generator -).images[0] -make_image_grid([init_image, image], rows=1, cols=2) -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_i2i.png) - - - - -You can get different results based on your prompt and the image you provide. To get the best results, we recommend trying different values for `num_inference_steps`, `strength`, and `guidance_scale` parameters and choose the best one. - - - - -## Combine with styled LoRAs - -LCM-LoRA can be combined with other LoRAs to generate styled-images in very few steps (4-8). In the following example, we'll use the LCM-LoRA with the [papercut LoRA](TheLastBen/Papercut_SDXL). -To learn more about how to combine LoRAs, refer to [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#combine-multiple-adapters). - -```python -import torch -from diffusers import DiffusionPipeline, LCMScheduler - -pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - variant="fp16", - torch_dtype=torch.float16 -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LoRAs -pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl", adapter_name="lcm") -pipe.load_lora_weights("TheLastBen/Papercut_SDXL", weight_name="papercut.safetensors", adapter_name="papercut") - -# Combine LoRAs -pipe.set_adapters(["lcm", "papercut"], adapter_weights=[1.0, 0.8]) - -prompt = "papercut, a cute fox" -generator = torch.manual_seed(0) -image = pipe(prompt, num_inference_steps=4, guidance_scale=1, generator=generator).images[0] -image -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdx_lora_mix.png) - - -## ControlNet/T2I-Adapter - -Let's look at how we can perform inference with ControlNet/T2I-Adapter and LCM-LoRA. - -### ControlNet -For this example, we'll use the SD-v1-5 model and the LCM-LoRA for SD-v1-5 with canny ControlNet. 
- -```python -import torch -import cv2 -import numpy as np -from PIL import Image - -from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, LCMScheduler -from diffusers.utils import load_image - -image = load_image( - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" -).resize((512, 512)) - -image = np.array(image) - -low_threshold = 100 -high_threshold = 200 - -image = cv2.Canny(image, low_threshold, high_threshold) -image = image[:, :, None] -image = np.concatenate([image, image, image], axis=2) -canny_image = Image.fromarray(image) - -controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) -pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - torch_dtype=torch.float16, - safety_checker=None, - variant="fp16" -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") - -generator = torch.manual_seed(0) -image = pipe( - "the mona lisa", - image=canny_image, - num_inference_steps=4, - guidance_scale=1.5, - controlnet_conditioning_scale=0.8, - cross_attention_kwargs={"scale": 1}, - generator=generator, -).images[0] -make_image_grid([canny_image, image], rows=1, cols=2) -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_controlnet.png) - - - -The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. - - -### T2I-Adapter - -This example shows how to use the LCM-LoRA with the [Canny T2I-Adapter](TencentARC/t2i-adapter-canny-sdxl-1.0) and SDXL. 
- -```python -import torch -import cv2 -import numpy as np -from PIL import Image - -from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, LCMScheduler -from diffusers.utils import load_image, make_image_grid - -# Prepare image -# Detect the canny map in low resolution to avoid high-frequency details -image = load_image( - "https://huggingface.co/Adapter/t2iadapter/resolve/main/figs_SDXLV1.0/org_canny.jpg" -).resize((384, 384)) - -image = np.array(image) - -low_threshold = 100 -high_threshold = 200 - -image = cv2.Canny(image, low_threshold, high_threshold) -image = image[:, :, None] -image = np.concatenate([image, image, image], axis=2) -canny_image = Image.fromarray(image).resize((1024, 1024)) - -# load adapter -adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") - -pipe = StableDiffusionXLAdapterPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - adapter=adapter, - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl") - -prompt = "Mystical fairy in real, magic, 4k picture, high quality" -negative_prompt = "extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured" - -generator = torch.manual_seed(0) -image = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - image=canny_image, - num_inference_steps=4, - guidance_scale=1.5, - adapter_conditioning_scale=0.8, - adapter_conditioning_factor=1, - generator=generator, -).images[0] -make_image_grid([canny_image, image], rows=1, cols=2) -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdxl_t2iadapter.png) - - -## Inpainting - -LCM-LoRA can be used for inpainting as well. - -```python -import torch -from diffusers import AutoPipelineForInpainting, LCMScheduler -from diffusers.utils import load_image, make_image_grid - -pipe = AutoPipelineForInpainting.from_pretrained( - "runwayml/stable-diffusion-inpainting", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5") - -# load base and mask image -init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png") -mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png") - -# generator = torch.Generator("cuda").manual_seed(92) -prompt = "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" -generator = torch.manual_seed(0) -image = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - num_inference_steps=4, - guidance_scale=4, -).images[0] -make_image_grid([init_image, mask_image, image], rows=1, cols=3) -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_inpainting.png) - - -## AnimateDiff - -[`AnimateDiff`] allows you to animate images using Stable Diffusion models. To get good results, we need to generate multiple frames (16-24), and doing this with standard SD models can be very slow. 
-LCM-LoRA can be used to speed up the process significantly, as you just need to do 4-8 steps for each frame. Let's look at how we can perform animation with LCM-LoRA and AnimateDiff. - -```python -import torch -from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler, LCMScheduler -from diffusers.utils import export_to_gif - -adapter = MotionAdapter.from_pretrained("diffusers/animatediff-motion-adapter-v1-5") -pipe = AnimateDiffPipeline.from_pretrained( - "frankjoshua/toonyou_beta6", - motion_adapter=adapter, -).to("cuda") - -# set scheduler -pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) - -# load LCM-LoRA -pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5", adapter_name="lcm") -pipe.load_lora_weights("guoyww/animatediff-motion-lora-zoom-in", weight_name="diffusion_pytorch_model.safetensors", adapter_name="motion-lora") - -pipe.set_adapters(["lcm", "motion-lora"], adapter_weights=[0.55, 1.2]) - -prompt = "best quality, masterpiece, 1girl, looking at viewer, blurry background, upper body, contemporary, dress" -generator = torch.manual_seed(0) -frames = pipe( - prompt=prompt, - num_inference_steps=5, - guidance_scale=1.25, - cross_attention_kwargs={"scale": 1}, - num_frames=24, - generator=generator -).frames[0] -export_to_gif(frames, "animation.gif") -``` - -![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lcm/lcm_sdv1-5_animatediff.gif) \ No newline at end of file From 7fa3e5b0f6a593d06e65f5b40ccd46acfafcfeb1 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Mon, 6 May 2024 14:25:24 +0200 Subject: [PATCH 45/56] Ci - change cache folder (#7867) --- .github/workflows/push_tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 00491e54b738..d071af2b0be2 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -61,7 +61,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 --privileged + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged steps: - name: Checkout diffusers uses: actions/checkout@v3 @@ -118,7 +118,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 defaults: run: shell: bash @@ -170,7 +170,7 @@ jobs: runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: diffusers/diffusers-pytorch-cuda - options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0 + options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 defaults: run: shell: bash From 0d23645bd120e6785a3d81ee8e053afe84bbc77f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 6 May 2024 15:07:25 -0700 Subject: [PATCH 46/56] [docs] Distilled inference (#7834) * combine * edits --- docs/source/en/_toctree.yml | 2 - docs/source/en/optimization/fp16.md | 107 ++++++++++---- .../source/en/using-diffusers/distilled_sd.md | 133 ------------------ 3 files changed, 80 insertions(+), 162 deletions(-) delete mode 100644 
docs/source/en/using-diffusers/distilled_sd.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 89af55ed2a5b..1c21d4cd9f74 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -139,8 +139,6 @@ - sections: - local: optimization/fp16 title: Speed up inference - - local: using-diffusers/distilled_sd - title: Distilled Stable Diffusion inference - local: optimization/memory title: Reduce memory usage - local: optimization/torch2.0 diff --git a/docs/source/en/optimization/fp16.md b/docs/source/en/optimization/fp16.md index 7a2cf934985c..b21b61368826 100644 --- a/docs/source/en/optimization/fp16.md +++ b/docs/source/en/optimization/fp16.md @@ -12,27 +12,23 @@ specific language governing permissions and limitations under the License. # Speed up inference -There are several ways to optimize 🤗 Diffusers for inference speed. As a general rule of thumb, we recommend using either [xFormers](xformers) or `torch.nn.functional.scaled_dot_product_attention` in PyTorch 2.0 for their memory-efficient attention. +There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times. - +> [!TIP] +> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide. -In many cases, optimizing for speed or memory leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about preserving memory in the [Reduce memory usage](memory) guide. +The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on a NVIDIA A100. - +| setup | latency | speed-up | +|----------|---------|----------| +| baseline | 5.27s | x1 | +| tf32 | 4.14s | x1.27 | +| fp16 | 3.51s | x1.50 | +| combined | 3.41s | x1.54 | -The results below are obtained from generating a single 512x512 image from the prompt `a photo of an astronaut riding a horse on mars` with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect. +## TensorFloat-32 -| | latency | speed-up | -| ---------------- | ------- | ------- | -| original | 9.50s | x1 | -| fp16 | 3.61s | x2.63 | -| channels last | 3.30s | x2.88 | -| traced UNet | 3.21s | x2.96 | -| memory efficient attention | 2.63s | x3.61 | - -## Use TensorFloat-32 - -On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (TF32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables TF32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling TF32 for matrix multiplications. 
It can significantly speeds up computations with typically negligible loss in numerical accuracy. +On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy. ```python import torch @@ -40,11 +36,11 @@ import torch torch.backends.cuda.matmul.allow_tf32 = True ``` -You can learn more about TF32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide. +Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide. ## Half-precision weights -To save GPU memory and get more speed, try loading and running the model weights directly in half-precision or float16: +To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights. ```Python import torch @@ -56,19 +52,76 @@ pipe = DiffusionPipeline.from_pretrained( use_safetensors=True, ) pipe = pipe.to("cuda") - -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] ``` - +> [!WARNING] +> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision. -Don't use [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision. +## Distilled model - +You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model. -## Distilled model +> [!TIP] +> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model. -You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model. +The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on a NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai). 
+ +| setup | latency | speed-up | +|------------------------------|---------|----------| +| baseline | 6.37s | x1 | +| distilled | 4.18s | x1.52 | +| distilled + tiny autoencoder | 3.83s | x1.66 | + +Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model. + +```py +from diffusers import StableDiffusionPipeline +import torch + +distilled = StableDiffusionPipeline.from_pretrained( + "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True, +).to("cuda") +prompt = "a golden vase with different flowers" +generator = torch.manual_seed(2023) +image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0] +image +``` + +
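For the side-by-side comparison mentioned above, the original (non-distilled) model can be loaded in the same way. This is a minimal sketch that assumes [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) as the baseline checkpoint and reuses the prompt and seed from the distilled example.

```py
from diffusers import StableDiffusionPipeline
import torch

# baseline pipeline with the original (non-distilled) checkpoint
original = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True,
).to("cuda")

prompt = "a golden vase with different flowers"
generator = torch.manual_seed(2023)  # same seed as the distilled example
image = original(prompt, num_inference_steps=25, generator=generator).images[0]
image
```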
+[Figure: sample outputs, "original Stable Diffusion" vs. "distilled Stable Diffusion"]
+ +### Tiny AutoEncoder + +To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it. + +```py +import torch +from diffusers import AutoencoderTiny, StableDiffusionPipeline + +distilled = StableDiffusionPipeline.from_pretrained( + "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True, +).to("cuda") +distilled.vae = AutoencoderTiny.from_pretrained( + "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True, +).to("cuda") + +prompt = "a golden vase with different flowers" +generator = torch.manual_seed(2023) +image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0] +image +``` -Learn more about in the [Distilled Stable Diffusion inference](../using-diffusers/distilled_sd) guide! +
+[Figure: sample output, "distilled Stable Diffusion + Tiny AutoEncoder"]
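The optimizations above can be stacked together. Below is a rough end-to-end sketch rather than a benchmark script: it combines tf32, float16 weights, the distilled checkpoint, and the tiny autoencoder, and times a single generation with `time.perf_counter`. Exact latencies depend on your hardware.

```py
import time

import torch
from diffusers import AutoencoderTiny, StableDiffusionPipeline

# allow tf32 matmuls on Ampere and later GPUs
torch.backends.cuda.matmul.allow_tf32 = True

# distilled UNet + tiny autoencoder, both loaded in float16
pipeline = StableDiffusionPipeline.from_pretrained(
    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
).to("cuda")
pipeline.vae = AutoencoderTiny.from_pretrained(
    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
).to("cuda")

prompt = "a golden vase with different flowers"
generator = torch.manual_seed(2023)

start = time.perf_counter()
image = pipeline(prompt, num_inference_steps=25, generator=generator).images[0]
print(f"latency: {time.perf_counter() - start:.2f}s")
image
```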
diff --git a/docs/source/en/using-diffusers/distilled_sd.md b/docs/source/en/using-diffusers/distilled_sd.md deleted file mode 100644 index c4c5f7ad1998..000000000000 --- a/docs/source/en/using-diffusers/distilled_sd.md +++ /dev/null @@ -1,133 +0,0 @@ - - -# Distilled Stable Diffusion inference - -[[open-in-colab]] - -Stable Diffusion inference can be a computationally intensive process because it must iteratively denoise the latents to generate an image. To reduce the computational burden, you can use a *distilled* version of the Stable Diffusion model from [Nota AI](https://huggingface.co/nota-ai). The distilled version of their Stable Diffusion model eliminates some of the residual and attention blocks from the UNet, reducing the model size by 51% and improving latency on CPU/GPU by 43%. - - - -Read this [blog post](https://huggingface.co/blog/sd_distillation) to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model. - - - -Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model: - -```py -from diffusers import StableDiffusionPipeline -import torch - -distilled = StableDiffusionPipeline.from_pretrained( - "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True, -).to("cuda") - -original = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16, use_safetensors=True, -).to("cuda") -``` - -Given a prompt, get the inference time for the original model: - -```py -import time - -seed = 2023 -generator = torch.manual_seed(seed) - -NUM_ITERS_TO_RUN = 3 -NUM_INFERENCE_STEPS = 25 -NUM_IMAGES_PER_PROMPT = 4 - -prompt = "a golden vase with different flowers" - -start = time.time_ns() -for _ in range(NUM_ITERS_TO_RUN): - images = original( - prompt, - num_inference_steps=NUM_INFERENCE_STEPS, - generator=generator, - num_images_per_prompt=NUM_IMAGES_PER_PROMPT - ).images -end = time.time_ns() -original_sd = f"{(end - start) / 1e6:.1f}" - -print(f"Execution time -- {original_sd} ms\n") -"Execution time -- 45781.5 ms" -``` - -Time the distilled model inference: - -```py -start = time.time_ns() -for _ in range(NUM_ITERS_TO_RUN): - images = distilled( - prompt, - num_inference_steps=NUM_INFERENCE_STEPS, - generator=generator, - num_images_per_prompt=NUM_IMAGES_PER_PROMPT - ).images -end = time.time_ns() - -distilled_sd = f"{(end - start) / 1e6:.1f}" -print(f"Execution time -- {distilled_sd} ms\n") -"Execution time -- 29884.2 ms" -``` - -
-[Figure: sample outputs, "original Stable Diffusion (45781.5 ms)" vs. "distilled Stable Diffusion (29884.2 ms)"]
- -## Tiny AutoEncoder - -To speed inference up even more, use a tiny distilled version of the [Stable Diffusion VAE](https://huggingface.co/sayakpaul/taesdxl-diffusers) to denoise the latents into images. Replace the VAE in the distilled Stable Diffusion model with the tiny VAE: - -```py -from diffusers import AutoencoderTiny - -distilled.vae = AutoencoderTiny.from_pretrained( - "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True, -).to("cuda") -``` - -Time the distilled model and distilled VAE inference: - -```py -start = time.time_ns() -for _ in range(NUM_ITERS_TO_RUN): - images = distilled( - prompt, - num_inference_steps=NUM_INFERENCE_STEPS, - generator=generator, - num_images_per_prompt=NUM_IMAGES_PER_PROMPT - ).images -end = time.time_ns() - -distilled_tiny_sd = f"{(end - start) / 1e6:.1f}" -print(f"Execution time -- {distilled_tiny_sd} ms\n") -"Execution time -- 27165.7 ms" -``` - -
-[Figure: sample output, "distilled Stable Diffusion + Tiny AutoEncoder (27165.7 ms)"]
From 23e091564fbfea1d3b56b5e293f4244367f65a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Somoza?= Date: Tue, 7 May 2024 07:54:57 -0400 Subject: [PATCH 47/56] Fix for "no lora weight found module" with some loras (#7875) * return layer weight if not found * better system and test * key example and typo --- src/diffusers/utils/peft_utils.py | 8 +++++++- tests/lora/test_lora_layers_sdxl.py | 30 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/diffusers/utils/peft_utils.py b/src/diffusers/utils/peft_utils.py index 8ea12e2e3b3f..ca55192ff7ae 100644 --- a/src/diffusers/utils/peft_utils.py +++ b/src/diffusers/utils/peft_utils.py @@ -246,7 +246,13 @@ def get_module_weight(weight_for_adapter, module_name): for layer_name, weight_ in weight_for_adapter.items(): if layer_name in module_name: return weight_ - raise RuntimeError(f"No LoRA weight found for module {module_name}.") + + parts = module_name.split(".") + # e.g. key = "down_blocks.1.attentions.0" + key = f"{parts[0]}.{parts[1]}.attentions.{parts[3]}" + block_weight = weight_for_adapter.get(key, 1.0) + + return block_weight # iterate over each adapter, make it active and set the corresponding scaling weight for adapter_name, weight in zip(adapter_names, weights): diff --git a/tests/lora/test_lora_layers_sdxl.py b/tests/lora/test_lora_layers_sdxl.py index b46b887d10fb..a8b2d2759f41 100644 --- a/tests/lora/test_lora_layers_sdxl.py +++ b/tests/lora/test_lora_layers_sdxl.py @@ -202,6 +202,36 @@ def test_sdxl_1_0_lora(self): pipe.unload_lora_weights() release_memory(pipe) + def test_sdxl_1_0_blockwise_lora(self): + generator = torch.Generator("cpu").manual_seed(0) + + pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + pipe.enable_model_cpu_offload() + lora_model_id = "hf-internal-testing/sdxl-1.0-lora" + lora_filename = "sd_xl_offset_example-lora_1.0.safetensors" + pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, adapter_name="offset") + scales = { + "unet": { + "down": {"block_1": [1.0, 1.0], "block_2": [1.0, 1.0]}, + "mid": 1.0, + "up": {"block_0": [1.0, 1.0, 1.0], "block_1": [1.0, 1.0, 1.0]}, + }, + } + pipe.set_adapters(["offset"], [scales]) + + images = pipe( + "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2 + ).images + + images = images[0, -3:, -3:, -1].flatten() + expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535]) + + max_diff = numpy_cosine_similarity_distance(expected, images) + assert max_diff < 1e-4 + + pipe.unload_lora_weights() + release_memory(pipe) + def test_sdxl_lcm_lora(self): pipe = StableDiffusionXLPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 From 8edaf3b79c564a56fbffb003ee74ac92e9162a73 Mon Sep 17 00:00:00 2001 From: Bagheera <59658056+bghira@users.noreply.github.com> Date: Tue, 7 May 2024 10:36:39 -0600 Subject: [PATCH 48/56] 7879 - adjust documentation to use naruto dataset, since pokemon is now gated (#7880) * 7879 - adjust documentation to use naruto dataset, since pokemon is now gated * replace references to pokemon in docs * more references to pokemon replaced * Japanese translation update --------- Co-authored-by: bghira --- docs/source/en/training/kandinsky.md | 20 +++++++++---------- docs/source/en/training/lora.md | 12 +++++------ docs/source/en/training/sdxl.md | 14 ++++++------- docs/source/en/training/text2image.md | 16 +++++++-------- 
docs/source/en/training/wuerstchen.md | 10 +++++----- docs/source/ko/training/lora.md | 8 ++++---- docs/source/ko/training/text2image.md | 18 ++++++++--------- .../ko/training/unconditional_training.md | 10 +++++----- .../consistency_distillation/README_sdxl.md | 4 ++-- .../train_lcm_distill_lora_sdxl.py | 2 +- examples/kandinsky2_2/text_to_image/README.md | 12 +++++------ .../train_text_to_image_lora_prior.py | 2 +- .../train_text_to_image_prior.py | 2 +- examples/research_projects/lora/README.md | 4 ++-- .../lora/train_text_to_image_lora.py | 2 +- .../onnxruntime/text_to_image/README.md | 2 +- .../text_to_image/train_text_to_image.py | 2 +- .../text_to_image/train_text_to_image.py | 2 +- .../text_to_image/train_text_to_image_lora.py | 2 +- .../train_text_to_image_lora_sdxl.py | 2 +- .../text_to_image/train_text_to_image_sdxl.py | 2 +- examples/text_to_image/README.md | 10 +++++----- examples/text_to_image/README_sdxl.md | 10 +++++----- examples/text_to_image/train_text_to_image.py | 2 +- .../text_to_image/train_text_to_image_flax.py | 2 +- .../text_to_image/train_text_to_image_lora.py | 2 +- .../train_text_to_image_lora_sdxl.py | 2 +- .../text_to_image/train_text_to_image_sdxl.py | 2 +- examples/wuerstchen/text_to_image/README.md | 6 +++--- .../train_text_to_image_lora_prior.py | 2 +- .../train_text_to_image_prior.py | 2 +- 31 files changed, 94 insertions(+), 94 deletions(-) diff --git a/docs/source/en/training/kandinsky.md b/docs/source/en/training/kandinsky.md index 2caec1035fa9..a1854d76c492 100644 --- a/docs/source/en/training/kandinsky.md +++ b/docs/source/en/training/kandinsky.md @@ -205,7 +205,7 @@ model_pred = unet(noisy_latents, timesteps, None, added_cond_kwargs=added_cond_k Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀 -You'll train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset. +You'll train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters, but you can also create and train on your own dataset by following the [Create a dataset for training](create_dataset) guide. Set the environment variable `DATASET_NAME` to the name of the dataset on the Hub or if you're training on your own files, set the environment variable `TRAIN_DIR` to a path to your dataset. If you’re training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. 
@@ -219,7 +219,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb` ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \ --dataset_name=$DATASET_NAME \ @@ -232,17 +232,17 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \ --checkpoints_total_limit=3 \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ - --validation_prompts="A robot pokemon, 4k photo" \ + --validation_prompts="A robot naruto, 4k photo" \ --report_to="wandb" \ --push_to_hub \ - --output_dir="kandi2-prior-pokemon-model" + --output_dir="kandi2-prior-naruto-model" ``` ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \ --dataset_name=$DATASET_NAME \ @@ -256,10 +256,10 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \ --checkpoints_total_limit=3 \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ - --validation_prompts="A robot pokemon, 4k photo" \ + --validation_prompts="A robot naruto, 4k photo" \ --report_to="wandb" \ --push_to_hub \ - --output_dir="kandi2-decoder-pokemon-model" + --output_dir="kandi2-decoder-naruto-model" ``` @@ -279,7 +279,7 @@ prior_components = {"prior_" + k: v for k,v in prior_pipeline.components.items() pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", **prior_components, torch_dtype=torch.float16) pipe.enable_model_cpu_offload() -prompt="A robot pokemon, 4k photo" +prompt="A robot naruto, 4k photo" image = pipeline(prompt=prompt, negative_prompt=negative_prompt).images[0] ``` @@ -299,7 +299,7 @@ import torch pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16) pipeline.enable_model_cpu_offload() -prompt="A robot pokemon, 4k photo" +prompt="A robot naruto, 4k photo" image = pipeline(prompt=prompt).images[0] ``` @@ -313,7 +313,7 @@ unet = UNet2DConditionModel.from_pretrained("path/to/saved/model" + "/checkpoint pipeline = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", unet=unet, torch_dtype=torch.float16) pipeline.enable_model_cpu_offload() -image = pipeline(prompt="A robot pokemon, 4k photo").images[0] +image = pipeline(prompt="A robot naruto, 4k photo").images[0] ```
diff --git a/docs/source/en/training/lora.md b/docs/source/en/training/lora.md index 78ac8a140e7c..737e6f0dfc32 100644 --- a/docs/source/en/training/lora.md +++ b/docs/source/en/training/lora.md @@ -170,7 +170,7 @@ Aside from setting up the LoRA layers, the training script is more or less the s Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀 -Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate our own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository: +Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and dataset respectively. You should also specify where to save the model in `OUTPUT_DIR`, and the name of the model to save to on the Hub with `HUB_MODEL_ID`. The script creates and saves the following files to your repository: - saved model checkpoints - `pytorch_lora_weights.safetensors` (the trained LoRA weights) @@ -185,9 +185,9 @@ A full training run takes ~5 hours on a 2080 Ti GPU with 11GB of VRAM. ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export OUTPUT_DIR="/sddata/finetune/lora/pokemon" -export HUB_MODEL_ID="pokemon-lora" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export OUTPUT_DIR="/sddata/finetune/lora/naruto" +export HUB_MODEL_ID="naruto-lora" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -208,7 +208,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \ --hub_model_id=${HUB_MODEL_ID} \ --report_to=wandb \ --checkpointing_steps=500 \ - --validation_prompt="A pokemon with blue eyes." \ + --validation_prompt="A naruto with blue eyes." \ --seed=1337 ``` @@ -220,7 +220,7 @@ import torch pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda") pipeline.load_lora_weights("path/to/lora/model", weight_name="pytorch_lora_weights.safetensors") -image = pipeline("A pokemon with blue eyes").images[0] +image = pipeline("A naruto with blue eyes").images[0] ``` ## Next steps diff --git a/docs/source/en/training/sdxl.md b/docs/source/en/training/sdxl.md index 0e51e720b48c..78178047d9fd 100644 --- a/docs/source/en/training/sdxl.md +++ b/docs/source/en/training/sdxl.md @@ -176,7 +176,7 @@ If you want to learn more about how the training loop works, check out the [Unde Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀 -Let’s train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities. 
+Let’s train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `DATASET_NAME` to the model and the dataset (either from the Hub or a local path). You should also specify a VAE other than the SDXL VAE (either from the Hub or a local path) with `VAE_NAME` to avoid numerical instabilities. @@ -187,7 +187,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb` ```bash export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image_sdxl.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -211,7 +211,7 @@ accelerate launch train_text_to_image_sdxl.py \ --validation_prompt="a cute Sundar Pichai creature" \ --validation_epochs 5 \ --checkpointing_steps=5000 \ - --output_dir="sdxl-pokemon-model" \ + --output_dir="sdxl-naruto-model" \ --push_to_hub ``` @@ -226,9 +226,9 @@ import torch pipeline = DiffusionPipeline.from_pretrained("path/to/your/model", torch_dtype=torch.float16).to("cuda") -prompt = "A pokemon with green eyes and red legs." +prompt = "A naruto with green eyes and red legs." image = pipeline(prompt, num_inference_steps=30, guidance_scale=7.5).images[0] -image.save("pokemon.png") +image.save("naruto.png") ``` @@ -244,11 +244,11 @@ import torch_xla.core.xla_model as xm device = xm.xla_device() pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0").to(device) -prompt = "A pokemon with green eyes and red legs." +prompt = "A naruto with green eyes and red legs." start = time() image = pipeline(prompt, num_inference_steps=inference_steps).images[0] print(f'Compilation time is {time()-start} sec') -image.save("pokemon.png") +image.save("naruto.png") start = time() image = pipeline(prompt, num_inference_steps=inference_steps).images[0] diff --git a/docs/source/en/training/text2image.md b/docs/source/en/training/text2image.md index d5c772c9db86..f69e9a710e8f 100644 --- a/docs/source/en/training/text2image.md +++ b/docs/source/en/training/text2image.md @@ -158,7 +158,7 @@ Once you've made all your changes or you're okay with the default configuration, -Let's train on the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset to generate your own Pokémon. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. +Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command. 
@@ -168,7 +168,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export dataset_name="lambdalabs/pokemon-blip-captions" +export dataset_name="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -183,7 +183,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --max_grad_norm=1 \ --enable_xformers_memory_efficient_attention --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" \ + --output_dir="sd-naruto-model" \ --push_to_hub ``` @@ -202,7 +202,7 @@ To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment va ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export dataset_name="lambdalabs/pokemon-blip-captions" +export dataset_name="lambdalabs/naruto-blip-captions" python train_text_to_image_flax.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -212,7 +212,7 @@ python train_text_to_image_flax.py \ --max_train_steps=15000 \ --learning_rate=1e-05 \ --max_grad_norm=1 \ - --output_dir="sd-pokemon-model" \ + --output_dir="sd-naruto-model" \ --push_to_hub ``` @@ -231,7 +231,7 @@ import torch pipeline = StableDiffusionPipeline.from_pretrained("path/to/saved_model", torch_dtype=torch.float16, use_safetensors=True).to("cuda") image = pipeline(prompt="yoda").images[0] -image.save("yoda-pokemon.png") +image.save("yoda-naruto.png") ``` @@ -246,7 +246,7 @@ from diffusers import FlaxStableDiffusionPipeline pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16) -prompt = "yoda pokemon" +prompt = "yoda naruto" prng_seed = jax.random.PRNGKey(0) num_inference_steps = 50 @@ -261,7 +261,7 @@ prompt_ids = shard(prompt_ids) images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) -image.save("yoda-pokemon.png") +image.save("yoda-naruto.png") ``` diff --git a/docs/source/en/training/wuerstchen.md b/docs/source/en/training/wuerstchen.md index c8d2842eb833..cd190639b865 100644 --- a/docs/source/en/training/wuerstchen.md +++ b/docs/source/en/training/wuerstchen.md @@ -131,7 +131,7 @@ If you want to learn more about how the training loop works, check out the [Unde Once you’ve made all your changes or you’re okay with the default configuration, you’re ready to launch the training script! 🚀 -Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Pokémon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide). +Set the `DATASET_NAME` environment variable to the dataset name from the Hub. This guide uses the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset, but you can create and train on your own datasets as well (see the [Create a dataset for training](create_dataset) guide). 
@@ -140,7 +140,7 @@ To monitor training progress with Weights & Biases, add the `--report_to=wandb` ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image_prior.py \ --mixed_precision="fp16" \ @@ -156,10 +156,10 @@ accelerate launch train_text_to_image_prior.py \ --checkpoints_total_limit=3 \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ - --validation_prompts="A robot pokemon, 4k photo" \ + --validation_prompts="A robot naruto, 4k photo" \ --report_to="wandb" \ --push_to_hub \ - --output_dir="wuerstchen-prior-pokemon-model" + --output_dir="wuerstchen-prior-naruto-model" ``` Once training is complete, you can use your newly trained model for inference! @@ -171,7 +171,7 @@ from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS pipeline = AutoPipelineForText2Image.from_pretrained("path/to/saved/model", torch_dtype=torch.float16).to("cuda") -caption = "A cute bird pokemon holding a shield" +caption = "A cute bird naruto holding a shield" images = pipeline( caption, width=1024, diff --git a/docs/source/ko/training/lora.md b/docs/source/ko/training/lora.md index 5bb8a1e69be4..e9c690d80652 100644 --- a/docs/source/ko/training/lora.md +++ b/docs/source/ko/training/lora.md @@ -49,15 +49,15 @@ huggingface-cli login ### 학습[[dreambooth-training]] -[Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다. +[Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다. 시작하려면 `MODEL_NAME` 및 `DATASET_NAME` 환경 변수가 설정되어 있는지 확인하십시오. `OUTPUT_DIR` 및 `HUB_MODEL_ID` 변수는 선택 사항이며 허브에서 모델을 저장할 위치를 지정합니다. ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export OUTPUT_DIR="/sddata/finetune/lora/pokemon" -export HUB_MODEL_ID="pokemon-lora" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export OUTPUT_DIR="/sddata/finetune/lora/naruto" +export HUB_MODEL_ID="naruto-lora" +export DATASET_NAME="lambdalabs/naruto-blip-captions" ``` 학습을 시작하기 전에 알아야 할 몇 가지 플래그가 있습니다. diff --git a/docs/source/ko/training/text2image.md b/docs/source/ko/training/text2image.md index f2ad3bb0719e..8a0463b497f4 100644 --- a/docs/source/ko/training/text2image.md +++ b/docs/source/ko/training/text2image.md @@ -73,12 +73,12 @@ xFormers는 Flax에 사용할 수 없습니다. 
-다음과 같이 [Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다: +다음과 같이 [Naruto BLIP 캡션](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋에서 파인튜닝 실행을 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" +export dataset_name="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -93,7 +93,7 @@ accelerate launch train_text_to_image.py \ --learning_rate=1e-05 \ --max_grad_norm=1 \ --lr_scheduler="constant" --lr_warmup_steps=0 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-naruto-model" ``` 자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다. @@ -136,7 +136,7 @@ pip install -U -r requirements_flax.txt ```bash export MODEL_NAME="runwayml/stable-diffusion-v1-5" -export dataset_name="lambdalabs/pokemon-blip-captions" +export dataset_name="lambdalabs/naruto-blip-captions" python train_text_to_image_flax.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -146,7 +146,7 @@ python train_text_to_image_flax.py \ --max_train_steps=15000 \ --learning_rate=1e-05 \ --max_grad_norm=1 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-naruto-model" ``` 자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https ://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다. @@ -166,7 +166,7 @@ python train_text_to_image_flax.py \ --max_train_steps=15000 \ --learning_rate=1e-05 \ --max_grad_norm=1 \ - --output_dir="sd-pokemon-model" + --output_dir="sd-naruto-model" ``` @@ -189,7 +189,7 @@ pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.flo pipe.to("cuda") image = pipe(prompt="yoda").images[0] -image.save("yoda-pokemon.png") +image.save("yoda-naruto.png") ``` @@ -203,7 +203,7 @@ from diffusers import FlaxStableDiffusionPipeline model_path = "path_to_saved_model" pipe, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16) -prompt = "yoda pokemon" +prompt = "yoda naruto" prng_seed = jax.random.PRNGKey(0) num_inference_steps = 50 @@ -218,7 +218,7 @@ prompt_ids = shard(prompt_ids) images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) -image.save("yoda-pokemon.png") +image.save("yoda-naruto.png") ``` \ No newline at end of file diff --git a/docs/source/ko/training/unconditional_training.md b/docs/source/ko/training/unconditional_training.md index d0c200ef2daa..de9ae39a7a76 100644 --- a/docs/source/ko/training/unconditional_training.md +++ b/docs/source/ko/training/unconditional_training.md @@ -103,13 +103,13 @@ accelerate launch train_unconditional.py \
-[Pokemon](https://huggingface.co/datasets/huggan/pokemon) 데이터셋을 사용할 경우: +[Naruto](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) 데이터셋을 사용할 경우: ```bash accelerate launch train_unconditional.py \ - --dataset_name="huggan/pokemon" \ + --dataset_name="lambdalabs/naruto-blip-captions" \ --resolution=64 \ - --output_dir="ddpm-ema-pokemon-64" \ + --output_dir="ddpm-ema-naruto-64" \ --train_batch_size=16 \ --num_epochs=100 \ --gradient_accumulation_steps=1 \ @@ -129,9 +129,9 @@ accelerate launch train_unconditional.py \ ```bash accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \ - --dataset_name="huggan/pokemon" \ + --dataset_name="lambdalabs/naruto-blip-captions" \ --resolution=64 --center_crop --random_flip \ - --output_dir="ddpm-ema-pokemon-64" \ + --output_dir="ddpm-ema-naruto-64" \ --train_batch_size=16 \ --num_epochs=100 \ --gradient_accumulation_steps=1 \ diff --git a/examples/consistency_distillation/README_sdxl.md b/examples/consistency_distillation/README_sdxl.md index d3abaa4ce175..6bd84727cf31 100644 --- a/examples/consistency_distillation/README_sdxl.md +++ b/examples/consistency_distillation/README_sdxl.md @@ -115,11 +115,11 @@ accelerate launch train_lcm_distill_lora_sdxl_wds.py \ We provide another version for LCM LoRA SDXL that follows best practices of `peft` and leverages the `datasets` library for quick experimentation. The script doesn't load two UNets unlike `train_lcm_distill_lora_sdxl_wds.py` which reduces the memory requirements quite a bit. -Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions): +Below is an example training command that trains an LCM LoRA on the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions): ```bash export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" export VAE_PATH="madebyollin/sdxl-vae-fp16-fix" accelerate launch train_lcm_distill_lora_sdxl.py \ diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py index 9405c238f937..56f83f47b84c 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py @@ -71,7 +71,7 @@ logger = get_logger(__name__) DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/kandinsky2_2/text_to_image/README.md b/examples/kandinsky2_2/text_to_image/README.md index 6e5a1835593f..d27ba1a21f0c 100644 --- a/examples/kandinsky2_2/text_to_image/README.md +++ b/examples/kandinsky2_2/text_to_image/README.md @@ -57,7 +57,7 @@ To disable wandb logging, remove the `--report_to=="wandb"` and `--validation_pr ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_decoder.py \ --dataset_name=$DATASET_NAME \ @@ -139,7 +139,7 @@ You can fine-tune the Kandinsky prior model with `train_text_to_image_prior.py` ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_prior.py \ --dataset_name=$DATASET_NAME \ @@ -183,7 +183,7 @@ 
If you want to use a fine-tuned decoder checkpoint along with your fine-tuned pr for running distributed training with `accelerate`. Here is an example command: ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image_decoder.py \ --dataset_name=$DATASET_NAME \ @@ -227,13 +227,13 @@ on consumer GPUs like Tesla T4, Tesla V100. ### Training -First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as explained in the [installation](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions). #### Train decoder ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \ --dataset_name=$DATASET_NAME --caption_column="text" \ @@ -252,7 +252,7 @@ accelerate launch --mixed_precision="fp16" train_text_to_image_decoder_lora.py \ #### Train prior ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image_prior_lora.py \ --dataset_name=$DATASET_NAME --caption_column="text" \ diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py index e169cf92beb9..f6f3896aaa12 100644 --- a/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py +++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_lora_prior.py @@ -332,7 +332,7 @@ def parse_args(): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py index bd95aed2939c..54a4d0a397b4 100644 --- a/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py +++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_prior.py @@ -56,7 +56,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/research_projects/lora/README.md b/examples/research_projects/lora/README.md index b5d72403166f..14cd6cd9be56 100644 --- a/examples/research_projects/lora/README.md +++ b/examples/research_projects/lora/README.md @@ -19,7 +19,7 @@ on consumer GPUs like Tesla T4, Tesla V100. ### Training -First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. 
Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions). **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** @@ -27,7 +27,7 @@ First, you need to set up your development environment as is explained in the [i ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" ``` For this example we want to directly store the trained LoRA embeddings on the Hub, so diff --git a/examples/research_projects/lora/train_text_to_image_lora.py b/examples/research_projects/lora/train_text_to_image_lora.py index cf00bf270057..1ebc1422b064 100644 --- a/examples/research_projects/lora/train_text_to_image_lora.py +++ b/examples/research_projects/lora/train_text_to_image_lora.py @@ -387,7 +387,7 @@ def parse_args(): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/research_projects/onnxruntime/text_to_image/README.md b/examples/research_projects/onnxruntime/text_to_image/README.md index 48bce2065444..8b499795746c 100644 --- a/examples/research_projects/onnxruntime/text_to_image/README.md +++ b/examples/research_projects/onnxruntime/text_to_image/README.md @@ -55,7 +55,7 @@ The command to train a DDPM UNetCondition model on the Pokemon dataset with onnx ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" +export dataset_name="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ --dataset_name=$dataset_name \ diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index ee61f033d34d..126a10b4f9e9 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -59,7 +59,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py index 0f4cc6c50b5e..d3bf95305dad 100644 --- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py +++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image.py @@ -61,7 +61,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git 
a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py index f22519b02e2b..a4b4d69bb892 100644 --- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py +++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora.py @@ -406,7 +406,7 @@ def parse_args(): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py index e5ff9d39e8ba..d7f2dcaa3442 100644 --- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_lora_sdxl.py @@ -468,7 +468,7 @@ def parse_args(input_args=None): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py index 1dac573fce4c..a056bcfc8cb1 100644 --- a/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py +++ b/examples/research_projects/scheduled_huber_loss_training/text_to_image/train_text_to_image_sdxl.py @@ -60,7 +60,7 @@ DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index fd6e50bc3710..9a8410604878 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -57,7 +57,7 @@ With `gradient_checkpointing` and `mixed_precision` it should be possible to fin ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -136,7 +136,7 @@ for running distributed training with `accelerate`. Here is an example command: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -192,7 +192,7 @@ on consumer GPUs like Tesla T4, Tesla V100. ### Training -First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). 
Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions). **___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** @@ -200,7 +200,7 @@ First, you need to set up your development environment as is explained in the [i ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" ``` For this example we want to directly store the trained LoRA embeddings on the Hub, so @@ -282,7 +282,7 @@ pip install -U -r requirements_flax.txt ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" python train_text_to_image_flax.py \ --pretrained_model_name_or_path=$MODEL_NAME \ diff --git a/examples/text_to_image/README_sdxl.md b/examples/text_to_image/README_sdxl.md index 349feef5008e..35ea0091c4f3 100644 --- a/examples/text_to_image/README_sdxl.md +++ b/examples/text_to_image/README_sdxl.md @@ -52,7 +52,7 @@ Note also that we use PEFT library as backend for LoRA training, make sure to ha ```bash export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image_sdxl.py \ --pretrained_model_name_or_path=$MODEL_NAME \ @@ -76,7 +76,7 @@ accelerate launch train_text_to_image_sdxl.py \ **Notes**: -* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/pokemon-blip-captions`](https://hf.co/datasets/lambdalabs/pokemon-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion. +* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/naruto-blip-captions`](https://hf.co/datasets/lambdalabs/naruto-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion. * The training script is compute-intensive and may not run on a consumer GPU like Tesla T4. * The training command shown above performs intermediate quality validation in between the training epochs and logs the results to Weights and Biases. `--report_to`, `--validation_prompt`, and `--validation_epochs` are the relevant CLI arguments here. * SDXL's VAE is known to suffer from numerical instability issues. 
This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). @@ -142,14 +142,14 @@ on consumer GPUs like Tesla T4, Tesla V100. ### Training -First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions). **___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___** ```bash export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" ``` For this example we want to directly store the trained LoRA embeddings on the Hub, so @@ -219,7 +219,7 @@ You need to save the mentioned configuration as an `accelerate_config.yaml` file ```shell export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml" accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \ diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index aa704ba8ca38..13ee0f2cc4c7 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -62,7 +62,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 557923c52e00..c3a08a90b4e5 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -250,7 +250,7 @@ def parse_args(): dataset_name_mapping = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 7164ac909cb2..37b10cfd1bad 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -387,7 +387,7 @@ def 
parse_args(): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 3604e755c62a..c9883252d14b 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -454,7 +454,7 @@ def parse_args(input_args=None): DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 88adbb995531..90602ad597a9 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -61,7 +61,7 @@ DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/wuerstchen/text_to_image/README.md b/examples/wuerstchen/text_to_image/README.md index d655259088e4..7583296e66d1 100644 --- a/examples/wuerstchen/text_to_image/README.md +++ b/examples/wuerstchen/text_to_image/README.md @@ -37,7 +37,7 @@ You can fine-tune the Würstchen prior model with the `train_text_to_image_prior ```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image_prior.py \ --mixed_precision="fp16" \ @@ -72,10 +72,10 @@ In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-de ### Prior Training -First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +First, you need to set up your development environment as explained in the [installation](#Running-locally-with-PyTorch) section. Make sure to set the `DATASET_NAME` environment variable. Here, we will use the [Pokemon captions dataset](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions). 
```bash -export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/naruto-blip-captions" accelerate launch train_text_to_image_lora_prior.py \ --mixed_precision="fp16" \ diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py index 76eaf6423960..79f7d8576ff4 100644 --- a/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py +++ b/examples/wuerstchen/text_to_image/train_text_to_image_lora_prior.py @@ -55,7 +55,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } diff --git a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py index 49cc5d26072d..3e0acfdaf519 100644 --- a/examples/wuerstchen/text_to_image/train_text_to_image_prior.py +++ b/examples/wuerstchen/text_to_image/train_text_to_image_prior.py @@ -56,7 +56,7 @@ logger = get_logger(__name__, log_level="INFO") DATASET_NAME_MAPPING = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), + "lambdalabs/naruto-blip-captions": ("image", "text"), } From c2217142bd8653116a6d0adf75de2cc36c37b0a1 Mon Sep 17 00:00:00 2001 From: Hyoungwon Cho Date: Wed, 8 May 2024 11:35:15 +0900 Subject: [PATCH 49/56] Modification on the PAG community pipeline (re) (#7876) * edited_pag_implementation * update --------- Co-authored-by: yiyixuxu --- .../pipeline_stable_diffusion_pag.py | 95 ++++++++----------- 1 file changed, 42 insertions(+), 53 deletions(-) diff --git a/examples/community/pipeline_stable_diffusion_pag.py b/examples/community/pipeline_stable_diffusion_pag.py index 04f38a888460..cdb7bd99cb29 100644 --- a/examples/community/pipeline_stable_diffusion_pag.py +++ b/examples/community/pipeline_stable_diffusion_pag.py @@ -1,4 +1,5 @@ -# Implementation of StableDiffusionPAGPipeline +# Implementation of StableDiffusionPipeline with PAG +# https://ku-cvlab.github.io/Perturbed-Attention-Guidance import inspect from typing import Any, Callable, Dict, List, Optional, Union @@ -134,8 +135,8 @@ def __call__( value = attn.to_v(hidden_states_ptb) - hidden_states_ptb = torch.zeros(value.shape).to(value.get_device()) - # hidden_states_ptb = value + # hidden_states_ptb = torch.zeros(value.shape).to(value.get_device()) + hidden_states_ptb = value hidden_states_ptb = hidden_states_ptb.to(query.dtype) @@ -1045,7 +1046,7 @@ def pag_scale(self): return self._pag_scale @property - def do_adversarial_guidance(self): + def do_perturbed_attention_guidance(self): return self._pag_scale > 0 @property @@ -1056,14 +1057,6 @@ def pag_adaptive_scaling(self): def do_pag_adaptive_scaling(self): return self._pag_adaptive_scaling > 0 - @property - def pag_drop_rate(self): - return self._pag_drop_rate - - @property - def pag_applied_layers(self): - return self._pag_applied_layers - @property def pag_applied_layers_index(self): return self._pag_applied_layers_index @@ -1080,8 +1073,6 @@ def __call__( guidance_scale: float = 7.5, pag_scale: float = 0.0, pag_adaptive_scaling: float = 0.0, - pag_drop_rate: float = 0.5, - pag_applied_layers: List[str] = ["down"], # ['down', 'mid', 'up'] pag_applied_layers_index: List[str] = ["d4"], # ['d4', 'd5', 'm0'] negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, @@ -1221,8 +1212,6 @@ def __call__( self._pag_scale = pag_scale self._pag_adaptive_scaling 
= pag_adaptive_scaling - self._pag_drop_rate = pag_drop_rate - self._pag_applied_layers = pag_applied_layers self._pag_applied_layers_index = pag_applied_layers_index # 2. Define call parameters @@ -1257,13 +1246,13 @@ def __call__( # to avoid doing two forward passes # cfg - if self.do_classifier_free_guidance and not self.do_adversarial_guidance: + if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) # pag - elif not self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: prompt_embeds = torch.cat([prompt_embeds, prompt_embeds]) # both - elif self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, prompt_embeds]) if ip_adapter_image is not None or ip_adapter_image_embeds is not None: @@ -1306,7 +1295,7 @@ def __call__( ).to(device=device, dtype=latents.dtype) # 7. Denoising loop - if self.do_adversarial_guidance: + if self.do_perturbed_attention_guidance: down_layers = [] mid_layers = [] up_layers = [] @@ -1322,6 +1311,29 @@ def __call__( else: raise ValueError(f"Invalid layer type: {layer_type}") + # change attention layer in UNet if use PAG + if self.do_perturbed_attention_guidance: + if self.do_classifier_free_guidance: + replace_processor = PAGCFGIdentitySelfAttnProcessor() + else: + replace_processor = PAGIdentitySelfAttnProcessor() + + drop_layers = self.pag_applied_layers_index + for drop_layer in drop_layers: + try: + if drop_layer[0] == "d": + down_layers[int(drop_layer[1])].processor = replace_processor + elif drop_layer[0] == "m": + mid_layers[int(drop_layer[1])].processor = replace_processor + elif drop_layer[0] == "u": + up_layers[int(drop_layer[1])].processor = replace_processor + else: + raise ValueError(f"Invalid layer type: {drop_layer[0]}") + except IndexError: + raise ValueError( + f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers." 
+ ) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order self._num_timesteps = len(timesteps) with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -1330,41 +1342,18 @@ def __call__( continue # cfg - if self.do_classifier_free_guidance and not self.do_adversarial_guidance: + if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance: latent_model_input = torch.cat([latents] * 2) # pag - elif not self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: latent_model_input = torch.cat([latents] * 2) # both - elif self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: latent_model_input = torch.cat([latents] * 3) # no else: latent_model_input = latents - # change attention layer in UNet if use PAG - if self.do_adversarial_guidance: - if self.do_classifier_free_guidance: - replace_processor = PAGCFGIdentitySelfAttnProcessor() - else: - replace_processor = PAGIdentitySelfAttnProcessor() - - drop_layers = self.pag_applied_layers_index - for drop_layer in drop_layers: - try: - if drop_layer[0] == "d": - down_layers[int(drop_layer[1])].processor = replace_processor - elif drop_layer[0] == "m": - mid_layers[int(drop_layer[1])].processor = replace_processor - elif drop_layer[0] == "u": - up_layers[int(drop_layer[1])].processor = replace_processor - else: - raise ValueError(f"Invalid layer type: {drop_layer[0]}") - except IndexError: - raise ValueError( - f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers." - ) - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual @@ -1381,14 +1370,14 @@ def __call__( # perform guidance # cfg - if self.do_classifier_free_guidance and not self.do_adversarial_guidance: + if self.do_classifier_free_guidance and not self.do_perturbed_attention_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) delta = noise_pred_text - noise_pred_uncond noise_pred = noise_pred_uncond + self.guidance_scale * delta # pag - elif not self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif not self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: noise_pred_original, noise_pred_perturb = noise_pred.chunk(2) signal_scale = self.pag_scale @@ -1400,7 +1389,7 @@ def __call__( noise_pred = noise_pred_original + signal_scale * (noise_pred_original - noise_pred_perturb) # both - elif self.do_classifier_free_guidance and self.do_adversarial_guidance: + elif self.do_classifier_free_guidance and self.do_perturbed_attention_guidance: noise_pred_uncond, noise_pred_text, noise_pred_text_perturb = noise_pred.chunk(3) signal_scale = self.pag_scale @@ -1458,11 +1447,8 @@ def __call__( # Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (image, has_nsfw_concept) - # change attention layer in UNet if use PAG - if self.do_adversarial_guidance: + if self.do_perturbed_attention_guidance: drop_layers = self.pag_applied_layers_index for drop_layer in drop_layers: try: @@ -1479,4 +1465,7 @@ def __call__( f"Invalid layer index: {drop_layer}. Available layers: {len(down_layers)} down layers, {len(mid_layers)} mid layers, {len(up_layers)} up layers." 
) + if not return_dict: + return (image, has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From d50baf0c632342b9576a24352244c4235ce8b875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 8 May 2024 05:45:02 +0300 Subject: [PATCH 50/56] Fix image upcasting (#7858) Fix image's upcasting before `vae.encode()` when using `fp16` Co-authored-by: YiYi Xu --- .../ledits_pp/pipeline_leditspp_stable_diffusion_xl.py | 1 - .../pipeline_stable_diffusion_xl_instruct_pix2pix.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py index cfab70926a4a..5ea7c2c14551 100644 --- a/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -1419,7 +1419,6 @@ def encode_image(self, image, dtype=None, height=None, width=None, resize_mode=" if needs_upcasting: image = image.float() self.upcast_vae() - image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) x0 = self.vae.encode(image).latent_dist.mode() x0 = x0.to(dtype) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index 5e7be370be01..d9380020b329 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -525,8 +525,8 @@ def prepare_image_latents( # make sure the VAE is in float32 mode, as it overflows in float16 needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast if needs_upcasting: + image = image.float() self.upcast_vae() - image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax") From f29b93488dee9af000fab6e7bdb68ab565d50564 Mon Sep 17 00:00:00 2001 From: Philip Pham Date: Wed, 8 May 2024 07:57:19 -0400 Subject: [PATCH 51/56] Check shape and remove deprecated APIs in scheduling_ddpm_flax.py (#7703) `model_output.shape` may only have rank 1. There are warnings related to use of random keys. ``` tests/schedulers/test_scheduler_flax.py: 13 warnings /Users/phillypham/diffusers/src/diffusers/schedulers/scheduling_ddpm_flax.py:268: FutureWarning: normal accepts a single key, but was given a key array of shape (1, 2) != (). Use jax.vmap for batching. In a future JAX version, this will be an error. noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype) tests/schedulers/test_scheduler_flax.py::FlaxDDPMSchedulerTest::test_betas /Users/phillypham/virtualenv/diffusers/lib/python3.9/site-packages/jax/_src/random.py:731: FutureWarning: uniform accepts a single key, but was given a key array of shape (1,) != (). Use jax.vmap for batching. In a future JAX version, this will be an error. 
u = uniform(key, shape, dtype, lo, hi) # type: ignore[arg-type] ``` --- src/diffusers/schedulers/scheduling_ddpm_flax.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py index 6bdfa5eb5e0d..d06a171159ee 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_flax.py +++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py @@ -222,9 +222,13 @@ def step( t = timestep if key is None: - key = jax.random.PRNGKey(0) + key = jax.random.key(0) - if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]: + if ( + len(model_output.shape) > 1 + and model_output.shape[1] == sample.shape[1] * 2 + and self.config.variance_type in ["learned", "learned_range"] + ): model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1) else: predicted_variance = None @@ -264,7 +268,7 @@ def step( # 6. Add noise def random_variance(): - split_key = jax.random.split(key, num=1) + split_key = jax.random.split(key, num=1)[0] noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype) return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise From 818f760732aa541438279055d133b6afb7128311 Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 8 May 2024 21:27:14 +0530 Subject: [PATCH 52/56] [Pipeline] AnimateDiff SDXL (#6721) * update conversion script to handle motion adapter sdxl checkpoint * add animatediff xl * handle addition_embed_type * fix output * update * add imports * make fix-copies * add decode latents * update docstrings * add animatediff sdxl to docs * remove unnecessary lines * update example * add test * revert conv_in conv_out kernel param * remove unused param addition_embed_type_num_heads * latest IPAdapter impl * make fix-copies * fix return * add IPAdapterTesterMixin to tests * fix return * revert based on suggestion * add freeinit * fix test_to_dtype test * use StableDiffusionMixin instead of different helper methods * fix progress bar iterations * apply suggestions from review * hardcode flip_sin_to_cos and freq_shift * make fix-copies * fix ip adapter implementation * fix last failing test * make style * Update docs/source/en/api/pipelines/animatediff.md Co-authored-by: Dhruv Nair * remove todo * fix doc-builder errors --------- Co-authored-by: Dhruv Nair --- docs/source/en/api/pipelines/animatediff.md | 53 + ..._animatediff_motion_module_to_diffusers.py | 7 +- src/diffusers/__init__.py | 2 + src/diffusers/models/unets/unet_3d_blocks.py | 2 + .../models/unets/unet_motion_model.py | 74 +- src/diffusers/pipelines/__init__.py | 3 +- .../pipelines/animatediff/__init__.py | 2 + .../animatediff/pipeline_animatediff_sdxl.py | 1284 +++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + .../animatediff/test_animatediff_sdxl.py | 307 ++++ 10 files changed, 1740 insertions(+), 9 deletions(-) create mode 100644 src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py create mode 100644 tests/pipelines/animatediff/test_animatediff_sdxl.py diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index 913529e6ebdc..425764541590 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -101,6 +101,53 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you
+### AnimateDiffSDXLPipeline + +AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available. + +```python +import torch +from diffusers.models import MotionAdapter +from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler +from diffusers.utils import export_to_gif + +adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16) + +model_id = "stabilityai/stable-diffusion-xl-base-1.0" +scheduler = DDIMScheduler.from_pretrained( + model_id, + subfolder="scheduler", + clip_sample=False, + timestep_spacing="linspace", + beta_schedule="linear", + steps_offset=1, +) +pipe = AnimateDiffSDXLPipeline.from_pretrained( + model_id, + motion_adapter=adapter, + scheduler=scheduler, + torch_dtype=torch.float16, + variant="fp16", +).to("cuda") + +# enable memory savings +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() + +output = pipe( + prompt="a panda surfing in the ocean, realistic, high quality", + negative_prompt="low quality, worst quality", + num_inference_steps=20, + guidance_scale=8, + width=1024, + height=1024, + num_frames=16, +) + +frames = output.frames[0] +export_to_gif(frames, "animation.gif") +``` + ### AnimateDiffVideoToVideoPipeline AnimateDiff can also be used to generate visually similar videos or enable style/character/background or other edits starting from an initial video, allowing you to seamlessly explore creative possibilities. @@ -522,6 +569,12 @@ export_to_gif(frames, "animatelcm-motion-lora.gif") - all - __call__ +## AnimateDiffSDXLPipeline + +[[autodoc]] AnimateDiffSDXLPipeline + - all + - __call__ + ## AnimateDiffVideoToVideoPipeline [[autodoc]] AnimateDiffVideoToVideoPipeline diff --git a/scripts/convert_animatediff_motion_module_to_diffusers.py b/scripts/convert_animatediff_motion_module_to_diffusers.py index e8fb007243fd..e188a6a533e8 100644 --- a/scripts/convert_animatediff_motion_module_to_diffusers.py +++ b/scripts/convert_animatediff_motion_module_to_diffusers.py @@ -31,6 +31,7 @@ def get_args(): parser.add_argument("--output_path", type=str, required=True) parser.add_argument("--use_motion_mid_block", action="store_true") parser.add_argument("--motion_max_seq_length", type=int, default=32) + parser.add_argument("--block_out_channels", nargs="+", default=[320, 640, 1280, 1280], type=int) parser.add_argument("--save_fp16", action="store_true") return parser.parse_args() @@ -49,11 +50,13 @@ def get_args(): conv_state_dict = convert_motion_module(state_dict) adapter = MotionAdapter( - use_motion_mid_block=args.use_motion_mid_block, motion_max_seq_length=args.motion_max_seq_length + block_out_channels=args.block_out_channels, + use_motion_mid_block=args.use_motion_mid_block, + motion_max_seq_length=args.motion_max_seq_length, ) # skip loading position embeddings adapter.load_state_dict(conv_state_dict, strict=False) adapter.save_pretrained(args.output_path) if args.save_fp16: - adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16") + adapter.to(dtype=torch.float16).save_pretrained(args.output_path, variant="fp16") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index d6aa1e11b6d2..f7fa82157c52 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -216,6 +216,7 @@ "AmusedInpaintPipeline", "AmusedPipeline", "AnimateDiffPipeline", + "AnimateDiffSDXLPipeline", "AnimateDiffVideoToVideoPipeline", "AudioLDM2Pipeline", "AudioLDM2ProjectionModel", @@ -595,6 +596,7 
@@ AmusedInpaintPipeline, AmusedPipeline, AnimateDiffPipeline, + AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline, AudioLDM2Pipeline, AudioLDM2ProjectionModel, diff --git a/src/diffusers/models/unets/unet_3d_blocks.py b/src/diffusers/models/unets/unet_3d_blocks.py index 97c91f61da1c..75827258f64c 100644 --- a/src/diffusers/models/unets/unet_3d_blocks.py +++ b/src/diffusers/models/unets/unet_3d_blocks.py @@ -121,6 +121,7 @@ def get_down_block( raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion") return CrossAttnDownBlockMotion( num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, @@ -255,6 +256,7 @@ def get_up_block( raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion") return CrossAttnUpBlockMotion( num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py index 595b7b03571c..81cc4b1f7ad4 100644 --- a/src/diffusers/models/unets/unet_motion_model.py +++ b/src/diffusers/models/unets/unet_motion_model.py @@ -211,6 +211,8 @@ def __init__( norm_num_groups: int = 32, norm_eps: float = 1e-5, cross_attention_dim: int = 1280, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, use_linear_projection: bool = False, num_attention_heads: Union[int, Tuple[int, ...]] = 8, motion_max_seq_length: int = 32, @@ -218,6 +220,9 @@ def __init__( use_motion_mid_block: int = True, encoder_hid_dim: Optional[int] = None, encoder_hid_dim_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + projection_class_embeddings_input_dim: Optional[int] = None, time_cond_proj_dim: Optional[int] = None, ): super().__init__() @@ -240,6 +245,21 @@ def __init__( f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." ) + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." 
+ ) + + if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None: + for layer_number_per_block in transformer_layers_per_block: + if isinstance(layer_number_per_block, list): + raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.") + # input conv_in_kernel = 3 conv_out_kernel = 3 @@ -260,6 +280,10 @@ def __init__( if encoder_hid_dim_type is None: self.encoder_hid_proj = None + if addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, True, 0) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + # class embedding self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) @@ -267,6 +291,15 @@ def __init__( if isinstance(num_attention_heads, int): num_attention_heads = (num_attention_heads,) * len(down_block_types) + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + # down output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_block_types): @@ -276,7 +309,7 @@ def __init__( down_block = get_down_block( down_block_type, - num_layers=layers_per_block, + num_layers=layers_per_block[i], in_channels=input_channel, out_channels=output_channel, temb_channels=time_embed_dim, @@ -284,13 +317,14 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[i], num_attention_heads=num_attention_heads[i], downsample_padding=downsample_padding, use_linear_projection=use_linear_projection, dual_cross_attention=False, temporal_num_attention_heads=motion_num_attention_heads, temporal_max_seq_length=motion_max_seq_length, + transformer_layers_per_block=transformer_layers_per_block[i], ) self.down_blocks.append(down_block) @@ -302,13 +336,14 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, dual_cross_attention=False, use_linear_projection=use_linear_projection, temporal_num_attention_heads=motion_num_attention_heads, temporal_max_seq_length=motion_max_seq_length, + transformer_layers_per_block=transformer_layers_per_block[-1], ) else: @@ -318,11 +353,12 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim[-1], num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, dual_cross_attention=False, use_linear_projection=use_linear_projection, + transformer_layers_per_block=transformer_layers_per_block[-1], ) # count how many layers upsample the images @@ -331,6 +367,9 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = 
list(reversed(transformer_layers_per_block)) output_channel = reversed_block_out_channels[0] for i, up_block_type in enumerate(up_block_types): @@ -349,7 +388,7 @@ def __init__( up_block = get_up_block( up_block_type, - num_layers=layers_per_block + 1, + num_layers=reversed_layers_per_block[i] + 1, in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, @@ -358,13 +397,14 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, + cross_attention_dim=reversed_cross_attention_dim[i], num_attention_heads=reversed_num_attention_heads[i], dual_cross_attention=False, resolution_idx=i, use_linear_projection=use_linear_projection, temporal_num_attention_heads=motion_num_attention_heads, temporal_max_seq_length=motion_max_seq_length, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -835,6 +875,28 @@ def forward( t_emb = t_emb.to(dtype=self.dtype) emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb if aug_emb is None else emb + aug_emb emb = emb.repeat_interleave(repeats=num_frames, dim=0) encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 169e44c9e3cd..c2dd7ac0d551 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -114,6 +114,7 @@ _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] _import_structure["animatediff"] = [ "AnimateDiffPipeline", + "AnimateDiffSDXLPipeline", "AnimateDiffVideoToVideoPipeline", ] _import_structure["audioldm"] = ["AudioLDMPipeline"] @@ -367,7 +368,7 @@ from ..utils.dummy_torch_and_transformers_objects import * else: from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline - from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline + from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline from .audioldm import AudioLDMPipeline from .audioldm2 import ( AudioLDM2Pipeline, diff --git a/src/diffusers/pipelines/animatediff/__init__.py b/src/diffusers/pipelines/animatediff/__init__.py index 35b99a76fd21..ae6b67b1924c 100644 --- a/src/diffusers/pipelines/animatediff/__init__.py +++ b/src/diffusers/pipelines/animatediff/__init__.py @@ -22,6 +22,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: 
_import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"] + _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"] _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -33,6 +34,7 @@ else: from .pipeline_animatediff import AnimateDiffPipeline + from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline from .pipeline_output import AnimateDiffPipelineOutput diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py new file mode 100644 index 000000000000..f15cd5dbebf7 --- /dev/null +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py @@ -0,0 +1,1284 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ImageProjection, MotionAdapter, UNet2DConditionModel, UNetMotionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import ( + USE_PEFT_BACKEND, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..free_init_utils import FreeInitMixin +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import AnimateDiffPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers.models import MotionAdapter + >>> from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif + + >>> adapter = MotionAdapter.from_pretrained( + ... "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16 + ... ) + + >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0" + >>> scheduler = DDIMScheduler.from_pretrained( + ... model_id, + ... subfolder="scheduler", + ... clip_sample=False, + ... timestep_spacing="linspace", + ... beta_schedule="linear", + ... steps_offset=1, + ... 
) + >>> pipe = AnimateDiffSDXLPipeline.from_pretrained( + ... model_id, + ... motion_adapter=adapter, + ... scheduler=scheduler, + ... torch_dtype=torch.float16, + ... variant="fp16", + ... ).to("cuda") + + >>> # enable memory savings + >>> pipe.enable_vae_slicing() + >>> pipe.enable_vae_tiling() + + >>> output = pipe( + ... prompt="a panda surfing in the ocean, realistic, high quality", + ... negative_prompt="low quality, worst quality", + ... num_inference_steps=20, + ... guidance_scale=8, + ... width=1024, + ... height=1024, + ... num_frames=16, + ... ) + + >>> frames = output.frames[0] + >>> export_to_gif(frames, "animation.gif") + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
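+
+    Examples (illustrative only; `pipe` stands for an already-loaded pipeline, and passing a custom
+    `timesteps` list requires a scheduler whose `set_timesteps` accepts `timesteps`):
+
+    >>> # let the scheduler build the schedule from a step count
+    >>> timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, num_inference_steps=25, device="cuda")
+    >>> # or pass an explicit, descending list of timesteps
+    >>> timesteps, num_inference_steps = retrieve_timesteps(pipe.scheduler, device="cuda", timesteps=[981, 721, 441, 161])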
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class AnimateDiffSDXLPipeline( + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + FreeInitMixin, +): + r""" + Pipeline for text-to-video generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: Union[UNet2DConditionModel, UNetMotionModel], + motion_adapter: MotionAdapter, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + + if isinstance(unet, UNet2DConditionModel): + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = self.unet.config.sample_size + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt with num_images_per_prompt->num_videos_per_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_videos_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_videos_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( 
+ "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view( + bs_embed * num_videos_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_videos_per_prompt).view( + bs_embed * num_videos_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead 
and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding( + self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 + ) -> torch.FloatTensor: + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + w (`torch.Tensor`): + Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings. + embedding_dim (`int`, *optional*, defaults to 512): + Dimension of the embeddings to generate. + dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + Data type of the generated embeddings. + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
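+    # In the denoising loop this amounts to
+    #     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+    # so `guidance_scale = 1` reduces to the text-conditioned prediction, i.e. no guidance.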
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + num_frames: int = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + num_frames: + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated video. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated video. This is set to 1024 by default for the best results. 
+ Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality video at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower video quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the video generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the video generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the + `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.AnimateDiffPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. 
Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. 
Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_videos_per_prompt=num_videos_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + + # 7.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 8. 
Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_videos_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1 + for free_init_iter in range(num_free_init_iters): + if self.free_init_enabled: + latents, timesteps = self._apply_free_init( + latents, free_init_iter, num_inference_steps, device, latents.dtype, generator + ) + + self._num_timesteps = len(timesteps) + + # 9. Denoising loop + with self.progress_bar(total=self._num_timesteps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds: + added_cond_kwargs["image_embeds"] = image_embeds + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg( + noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + progress_bar.update() + + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # 10. 
Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + # 11. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffPipelineOutput(frames=video) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index b750b60491a4..0583cf839ff7 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -92,6 +92,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AnimateDiffSDXLPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/animatediff/test_animatediff_sdxl.py b/tests/pipelines/animatediff/test_animatediff_sdxl.py new file mode 100644 index 000000000000..2db0139154e9 --- /dev/null +++ b/tests/pipelines/animatediff/test_animatediff_sdxl.py @@ -0,0 +1,307 @@ +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +import diffusers +from diffusers import ( + AnimateDiffSDXLPipeline, + AutoencoderKL, + DDIMScheduler, + MotionAdapter, + UNet2DConditionModel, + UNetMotionModel, +) +from diffusers.utils import is_xformers_available, logging +from diffusers.utils.testing_utils import torch_device + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import ( + IPAdapterTesterMixin, + PipelineTesterMixin, + SDFunctionTesterMixin, + SDXLOptionalComponentsTesterMixin, +) + + +def to_np(tensor): + if isinstance(tensor, torch.Tensor): + tensor = tensor.detach().cpu().numpy() + + return tensor + + +class AnimateDiffPipelineSDXLFastTests( + IPAdapterTesterMixin, + SDFunctionTesterMixin, + PipelineTesterMixin, + SDXLOptionalComponentsTesterMixin, + unittest.TestCase, +): + pipeline_class = AnimateDiffSDXLPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", + ] + ) + callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union({"add_text_embeds", "add_time_ids"}) + + def get_dummy_components(self, time_cond_proj_dim=None): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64, 128), + layers_per_block=2, + time_cond_proj_dim=time_cond_proj_dim, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), + # SD2-specific config below + attention_head_dim=(2, 4, 8), + 
use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2, 4), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64, + norm_num_groups=1, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="linear", + clip_sample=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + motion_adapter = MotionAdapter( + block_out_channels=(32, 64, 128), + motion_layers_per_block=2, + motion_norm_num_groups=2, + motion_num_attention_heads=4, + use_motion_mid_block=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "motion_adapter": motion_adapter, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "feature_extractor": None, + "image_encoder": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + def test_motion_unet_loading(self): + components = self.get_dummy_components() + pipe = AnimateDiffSDXLPipeline(**components) + + assert isinstance(pipe.unet, UNetMotionModel) + + @unittest.skip("Attention slicing is not enabled in this pipeline") + def test_attention_slicing_forward_pass(self): + pass + + def test_inference_batch_single_identical( + self, + batch_size=2, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for components in pipe.components.values(): + if hasattr(components, "set_default_attn_processor"): + components.set_default_attn_processor() + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + # Reset generator in case it is has been used in self.get_dummy_inputs + inputs["generator"] = self.get_generator(0) + + logger = logging.get_logger(pipe.__module__) + logger.setLevel(level=diffusers.logging.FATAL) + + # batchify inputs + batched_inputs = {} + batched_inputs.update(inputs) + + for name in self.batch_params: + if name not in inputs: + continue + + value = inputs[name] + if name == "prompt": + len_prompt = len(value) + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] + batched_inputs[name][-1] = 100 * "very long" + + 
else: + batched_inputs[name] = batch_size * [value] + + if "generator" in inputs: + batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)] + + if "batch_size" in inputs: + batched_inputs["batch_size"] = batch_size + + for arg in additional_params_copy_to_batched_inputs: + batched_inputs[arg] = inputs[arg] + + output = pipe(**inputs) + output_batch = pipe(**batched_inputs) + + assert output_batch[0].shape[0] == batch_size + + max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max() + assert max_diff < expected_max_diff + + @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices") + def test_to_device(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + pipe.to("cpu") + # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cpu" for device in model_devices)) + + output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] + self.assertTrue(np.isnan(output_cpu).sum() == 0) + + pipe.to("cuda") + model_devices = [ + component.device.type for component in pipe.components.values() if hasattr(component, "device") + ] + self.assertTrue(all(device == "cuda" for device in model_devices)) + + output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0] + self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) + + def test_to_dtype(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + + # pipeline creates a new motion UNet under the hood. 
So we need to check the dtype from pipe.components + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) + + pipe.to(dtype=torch.float16) + model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")] + self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) + + def test_prompt_embeds(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs = self.get_dummy_inputs(torch_device) + prompt = inputs.pop("prompt") + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = pipe.encode_prompt(prompt) + + pipe( + **inputs, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + + def test_save_load_optional_components(self): + self._test_save_load_optional_components() + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + output_without_offload = pipe(**inputs).frames[0] + output_without_offload = ( + output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload + ) + + pipe.enable_xformers_memory_efficient_attention() + inputs = self.get_dummy_inputs(torch_device) + output_with_offload = pipe(**inputs).frames[0] + output_with_offload = ( + output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload + ) + + max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() + self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results") From 35358a2decad0c1e2972a8311ff49bfc14a326e1 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 8 May 2024 07:59:08 -1000 Subject: [PATCH 53/56] fix offload test (#7868) fix Co-authored-by: Dhruv Nair --- tests/pipelines/test_pipelines_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 032fbb81ea31..c4b7a3b9187a 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -1375,7 +1375,7 @@ def test_sequential_cpu_offload_forward_pass(self, expected_max_diff=1e-4): output_without_offload = pipe(**inputs)[0] pipe.enable_sequential_cpu_offload() - assert pipe._execution_device.type == pipe._offload_device.type + assert pipe._execution_device.type == "cuda" inputs = self.get_dummy_inputs(generator_device) output_with_offload = pipe(**inputs)[0] @@ -1440,7 +1440,7 @@ def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4): output_without_offload = pipe(**inputs)[0] pipe.enable_model_cpu_offload() - assert pipe._execution_device.type == pipe._offload_device.type + assert 
pipe._execution_device.type == "cuda" inputs = self.get_dummy_inputs(generator_device) output_with_offload = pipe(**inputs)[0] From 75aab34675bf22a8ad12066cf0178c7c7c79bdd0 Mon Sep 17 00:00:00 2001 From: Pierre Dulac Date: Wed, 8 May 2024 22:41:58 +0200 Subject: [PATCH 54/56] Allow users to save SDXL LoRA weights for only one text encoder (#7607) SDXL LoRA weights for text encoders should be decoupled on save The method checks if at least one of unet, text_encoder and text_encoder_2 lora weights are passed, which was not reflected in the implentation. Co-authored-by: Sayak Paul Co-authored-by: YiYi Xu --- src/diffusers/loaders/lora.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 2f525986a096..24be4999de27 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -1406,6 +1406,9 @@ def save_lora_weights( text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text encoder LoRA state dict because it comes from 🤗 Transformers. + text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. is_main_process (`bool`, *optional*, defaults to `True`): Whether the process calling this is the main process or not. Useful during distributed training and you need to call this function on all processes. In this case, set `is_main_process=True` only on the main @@ -1432,8 +1435,10 @@ def pack_weights(layers, prefix): if unet_lora_layers: state_dict.update(pack_weights(unet_lora_layers, "unet")) - if text_encoder_lora_layers and text_encoder_2_lora_layers: + if text_encoder_lora_layers: state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) + + if text_encoder_2_lora_layers: state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) cls.write_lora_layers( From c1c42698c955959d7ef34af129428f64c6e363bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Thu, 9 May 2024 02:15:28 +0300 Subject: [PATCH 55/56] Remove dead code and fix f-string issue (#7720) * Remove dead code * PylancereportGeneralTypeIssues: Strings nested within an f-string cannot use the same quote character as the f-string prior to Python 3.12. * Remove dead code --- examples/community/checkpoint_merger.py | 1 - examples/community/latent_consistency_img2img.py | 8 -------- scripts/convert_zero123_to_diffusers.py | 2 +- src/diffusers/image_processor.py | 1 - 4 files changed, 1 insertion(+), 11 deletions(-) diff --git a/examples/community/checkpoint_merger.py b/examples/community/checkpoint_merger.py index 9df5943a86b1..f702bf0cea9b 100644 --- a/examples/community/checkpoint_merger.py +++ b/examples/community/checkpoint_merger.py @@ -138,7 +138,6 @@ def merge(self, pretrained_model_name_or_path_list: List[Union[str, os.PathLike] comparison_result &= self._compare_model_configs(config_dicts[idx - 1], config_dicts[idx]) if not force and comparison_result is False: raise ValueError("Incompatible checkpoints. Please check model_index.json for the models.") - print(config_dicts[0], config_dicts[1]) print("Compatible model_index.json files found") # Step 2: Basic Validation has succeeded. 
Let's download the models and save them into our local files. cached_folders = [] diff --git a/examples/community/latent_consistency_img2img.py b/examples/community/latent_consistency_img2img.py index 98078a2eef96..3c5ffa845699 100644 --- a/examples/community/latent_consistency_img2img.py +++ b/examples/community/latent_consistency_img2img.py @@ -240,14 +240,6 @@ def prepare_latents( return latents - if latents is None: - latents = torch.randn(shape, dtype=dtype).to(device) - else: - latents = latents.to(device) - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma - return latents - def get_w_embedding(self, w, embedding_dim=512, dtype=torch.float32): """ see https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py index 669a4962be3c..b46633fae7ff 100644 --- a/scripts/convert_zero123_to_diffusers.py +++ b/scripts/convert_zero123_to_diffusers.py @@ -113,7 +113,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params["adm_in_channels"] else: - raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}") + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params['num_classes']}") config = { "sample_size": image_size // vae_scale_factor, diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 4ccb9d77d627..0f4481570829 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -80,7 +80,6 @@ def __init__( " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.", " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`", ) - self.config.do_convert_rgb = False @staticmethod def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]: From caf9e985df761413f8bbeea67eb406b86daa71a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Thu, 9 May 2024 08:34:44 +0300 Subject: [PATCH 56/56] Fix several imports (#7712) Fix imports --- src/diffusers/pipelines/stable_diffusion/__init__.py | 1 - src/diffusers/schedulers/deprecated/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 0eda32d333b9..8ce08aec66ca 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -113,7 +113,6 @@ from .pipeline_stable_diffusion import ( StableDiffusionPipeline, StableDiffusionPipelineOutput, - StableDiffusionSafetyChecker, ) from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline diff --git a/src/diffusers/schedulers/deprecated/__init__.py b/src/diffusers/schedulers/deprecated/__init__.py index 786707f45206..479cf9bd568b 100644 --- a/src/diffusers/schedulers/deprecated/__init__.py +++ b/src/diffusers/schedulers/deprecated/__init__.py @@ -30,7 +30,7 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ..utils.dummy_pt_objects import * # noqa F403 + from 
...utils.dummy_pt_objects import * # noqa F403
 else:
     from .scheduling_karras_ve import KarrasVeScheduler
     from .scheduling_sde_vp import ScoreSdeVpScheduler
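
For readers of PATCH 54/56 above: the practical effect is that StableDiffusionXLPipeline.save_lora_weights (a classmethod, per the cls.write_lora_layers call in that diff) now persists whichever of the three LoRA state dicts you pass, instead of saving text_encoder weights only when text_encoder_2 weights are passed as well. A minimal usage sketch under that reading; the output directory and the dummy state-dict entries are placeholders, not values from the patch:

    import torch
    from diffusers import StableDiffusionXLPipeline

    # Hypothetical stand-in for LoRA layers produced by a training loop
    # (e.g. a PEFT state dict extracted from the first text encoder).
    text_encoder_lora_layers = {
        "text_model.encoder.layers.0.self_attn.q_proj.lora_A.weight": torch.zeros(4, 768),
        "text_model.encoder.layers.0.self_attn.q_proj.lora_B.weight": torch.zeros(768, 4),
    }

    # After this patch, passing only one of unet_lora_layers /
    # text_encoder_lora_layers / text_encoder_2_lora_layers is enough.
    StableDiffusionXLPipeline.save_lora_weights(
        save_directory="sdxl-lora-text-encoder-only",  # placeholder output directory
        text_encoder_lora_layers=text_encoder_lora_layers,
    )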
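
The f-string bullet in the PATCH 55/56 message is terse; the constraint it refers to is that, before Python 3.12, a string literal inside an f-string replacement field cannot reuse the quote character that delimits the f-string itself, which is what the convert_zero123_to_diffusers.py line did. A self-contained illustration (the dictionary value here is made up):

    unet_params = {"num_classes": "sequential"}

    # SyntaxError before Python 3.12: the inner string reuses the f-string's own quotes.
    #   f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}"

    # Portable form, as used in the patch: different quotes inside the replacement field.
    message = f"Unknown conditional unet num_classes config: {unet_params['num_classes']}"
    print(message)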
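
The one-line summary of PATCH 56/56 ("Fix imports") leaves the reason implicit: diffusers/schedulers/deprecated/ sits one package level below diffusers/schedulers/, so reaching diffusers/utils from its __init__.py takes three leading dots, not two. A short sketch of the resolution; the absolute import is shown purely for illustration:

    # Package layout (paths taken from the diffs above):
    #   src/diffusers/utils/dummy_pt_objects.py          <- target module
    #   src/diffusers/schedulers/deprecated/__init__.py  <- importing module
    #
    # From diffusers.schedulers.deprecated, two dots resolve to diffusers.schedulers,
    # so `..utils` points at a non-existent diffusers.schedulers.utils; three dots
    # resolve to the diffusers package itself.
    from diffusers.utils import dummy_pt_objects  # absolute equivalent of `from ...utils import ...`

    print(dummy_pt_objects.__name__)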