From fdec8bd6754f8ae5428fb542f08707e0a5aba24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Somoza?= Date: Thu, 28 Nov 2024 12:57:55 -0500 Subject: [PATCH 01/38] Change image_gen_aux repository URL (#10048) change image_gen_aux repo url --- docs/source/en/api/pipelines/flux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/flux.md b/docs/source/en/api/pipelines/flux.md index 94624264646f..f776dc049ebd 100644 --- a/docs/source/en/api/pipelines/flux.md +++ b/docs/source/en/api/pipelines/flux.md @@ -148,7 +148,7 @@ image.save("output.png") **Note:** `black-forest-labs/Flux.1-Depth-dev` is _not_ a ControlNet model. [`ControlNetModel`] models are a separate component from the UNet/Transformer whose residuals are added to the actual underlying model. Depth Control is an alternate architecture that achieves effectively the same results as a ControlNet model would, by using channel-wise concatenation with input control condition and ensuring the transformer learns structure control by following the condition as closely as possible. ```python -# !pip install git+https://github.com/asomoza/image_gen_aux.git +# !pip install git+https://github.com/huggingface/image_gen_aux import torch from diffusers import FluxControlPipeline, FluxTransformer2DModel from diffusers.utils import load_image From 6b288ec44d80cf1fde57fac8e0e625f7fc0d720f Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Fri, 29 Nov 2024 14:03:41 +0800 Subject: [PATCH 02/38] make `pipelines` tests device-agnostic (part2) (#9400) * enable on xpu * add 1 more * add one more * enable more * add 1 more * add more * enable 1 * enable more cases * enable * enable * update comment * one more * enable 1 * add more cases * enable xpu * add one more caswe * add more cases * add 1 * add more * add more cases * add case * enable * add more * add more * add more * enbale more * add more * update code * update test marker * add skip back * update comment * remove single files * remove * style * add * revert * reformat * enable * enable esingle g * add 2 more * update decorator * update * update * update * Update tests/pipelines/deepfloyd_if/test_if.py Co-authored-by: Dhruv Nair * Update src/diffusers/utils/testing_utils.py Co-authored-by: Dhruv Nair * Update tests/pipelines/animatediff/test_animatediff_controlnet.py Co-authored-by: Dhruv Nair * Update tests/pipelines/animatediff/test_animatediff.py Co-authored-by: Dhruv Nair * Update tests/pipelines/animatediff/test_animatediff_controlnet.py Co-authored-by: Dhruv Nair * update float16 * no unitest.skipt * update * apply style check * adapt style --------- Co-authored-by: Sayak Paul Co-authored-by: Dhruv Nair --- tests/single_file/single_file_testing_utils.py | 4 ++-- .../test_model_controlnet_single_file.py | 10 ++++++---- .../test_model_sd_cascade_unet_single_file.py | 10 ++++++---- tests/single_file/test_model_vae_single_file.py | 9 +++++---- ...e_diffusion_controlnet_img2img_single_file.py | 9 +++++---- ...e_diffusion_controlnet_inpaint_single_file.py | 10 ++++++---- ...st_stable_diffusion_controlnet_single_file.py | 10 ++++++---- .../test_stable_diffusion_img2img_single_file.py | 16 +++++++++------- .../test_stable_diffusion_inpaint_single_file.py | 16 +++++++++------- .../test_stable_diffusion_single_file.py | 14 ++++++++------ .../test_stable_diffusion_upscale_single_file.py | 10 ++++++---- ...st_stable_diffusion_xl_adapter_single_file.py | 10 ++++++---- ...stable_diffusion_xl_controlnet_single_file.py | 9 +++++---- 
...st_stable_diffusion_xl_img2img_single_file.py | 12 +++++++----- .../test_stable_diffusion_xl_instruct_pix2pix.py | 10 ++++++---- .../test_stable_diffusion_xl_single_file.py | 10 ++++++---- 16 files changed, 98 insertions(+), 71 deletions(-) diff --git a/tests/single_file/single_file_testing_utils.py b/tests/single_file/single_file_testing_utils.py index 9b89578c5a8c..d4f6ec994231 100644 --- a/tests/single_file/single_file_testing_utils.py +++ b/tests/single_file/single_file_testing_utils.py @@ -156,14 +156,14 @@ def test_single_file_components_with_original_config_local_files_only( def test_single_file_format_inference_is_same_as_pretrained(self, expected_max_diff=1e-4): sf_pipe = self.pipeline_class.from_single_file(self.ckpt_path, safety_checker=None) sf_pipe.unet.set_attn_processor(AttnProcessor()) - sf_pipe.enable_model_cpu_offload() + sf_pipe.enable_model_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device) image_single_file = sf_pipe(**inputs).images[0] pipe = self.pipeline_class.from_pretrained(self.repo_id, safety_checker=None) pipe.unet.set_attn_processor(AttnProcessor()) - pipe.enable_model_cpu_offload() + pipe.enable_model_cpu_offload(device=torch_device) inputs = self.get_inputs(torch_device) image = pipe(**inputs).images[0] diff --git a/tests/single_file/test_model_controlnet_single_file.py b/tests/single_file/test_model_controlnet_single_file.py index 1d5b790ebb4a..bfcb802380a6 100644 --- a/tests/single_file/test_model_controlnet_single_file.py +++ b/tests/single_file/test_model_controlnet_single_file.py @@ -22,9 +22,11 @@ ControlNetModel, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) @@ -32,7 +34,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class ControlNetModelSingleFileTests(unittest.TestCase): model_class = ControlNetModel ckpt_path = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" @@ -41,12 +43,12 @@ class ControlNetModelSingleFileTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_single_file_components(self): model = self.model_class.from_pretrained(self.repo_id) diff --git a/tests/single_file/test_model_sd_cascade_unet_single_file.py b/tests/single_file/test_model_sd_cascade_unet_single_file.py index 43253eb6d59f..08b04e3cd7e8 100644 --- a/tests/single_file/test_model_sd_cascade_unet_single_file.py +++ b/tests/single_file/test_model_sd_cascade_unet_single_file.py @@ -21,9 +21,11 @@ from diffusers import StableCascadeUNet from diffusers.utils import logging from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) @@ -33,17 +35,17 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableCascadeUNetSingleFileTest(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_single_file_components_stage_b(self): model_single_file = StableCascadeUNet.from_single_file( diff --git a/tests/single_file/test_model_vae_single_file.py 
b/tests/single_file/test_model_vae_single_file.py index 63f2bb757472..9db4cddb3c9d 100644 --- a/tests/single_file/test_model_vae_single_file.py +++ b/tests/single_file/test_model_vae_single_file.py @@ -22,10 +22,11 @@ AutoencoderKL, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, load_hf_numpy, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -35,7 +36,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class AutoencoderKLSingleFileTests(unittest.TestCase): model_class = AutoencoderKL ckpt_path = ( @@ -48,12 +49,12 @@ class AutoencoderKLSingleFileTests(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_file_format(self, seed, shape): return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" diff --git a/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py index 332bcfbe03b6..8c312b1285e2 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_img2img_single_file.py @@ -8,9 +8,10 @@ from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -27,7 +28,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetPipeline ckpt_path = ( @@ -41,12 +42,12 @@ class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SD def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py index c0d70123b286..37879f36561f 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_inpaint_single_file.py @@ -8,10 +8,12 @@ from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import ( @@ -26,7 +28,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionControlNetInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetInpaintPipeline ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt" @@ 
-36,12 +38,12 @@ class StableDiffusionControlNetInpaintPipelineSingleFileSlowTests(unittest.TestC def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self): control_image = load_image( diff --git a/tests/single_file/test_stable_diffusion_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_controlnet_single_file.py index 3b5cf910b080..ef9fb8a3b1e4 100644 --- a/tests/single_file/test_stable_diffusion_controlnet_single_file.py +++ b/tests/single_file/test_stable_diffusion_controlnet_single_file.py @@ -8,10 +8,12 @@ from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import ( @@ -26,7 +28,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionControlNetPipeline ckpt_path = ( @@ -40,12 +42,12 @@ class StableDiffusionControlNetPipelineSingleFileSlowTests(unittest.TestCase, SD def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self): control_image = load_image( diff --git a/tests/single_file/test_stable_diffusion_img2img_single_file.py b/tests/single_file/test_stable_diffusion_img2img_single_file.py index 04f36f255014..9ad935582409 100644 --- a/tests/single_file/test_stable_diffusion_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_img2img_single_file.py @@ -8,9 +8,11 @@ ) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import SDSingleFileTesterMixin @@ -20,7 +22,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionImg2ImgPipeline ckpt_path = ( @@ -34,12 +36,12 @@ class StableDiffusionImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSin def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -63,7 +65,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusion21Img2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionImg2ImgPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors" @@ -73,12 +75,12 @@ class StableDiffusion21Img2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDS def setUp(self): super().setUp() 
gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_inpaint_single_file.py b/tests/single_file/test_stable_diffusion_inpaint_single_file.py index 5c6734a9a33e..b05a098c0bcb 100644 --- a/tests/single_file/test_stable_diffusion_inpaint_single_file.py +++ b/tests/single_file/test_stable_diffusion_inpaint_single_file.py @@ -8,9 +8,11 @@ ) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import SDSingleFileTesterMixin @@ -20,7 +22,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionInpaintPipeline ckpt_path = "https://huggingface.co/botp/stable-diffusion-v1-5-inpainting/blob/main/sd-v1-5-inpainting.ckpt" @@ -30,12 +32,12 @@ class StableDiffusionInpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSin def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -78,7 +80,7 @@ def test_single_file_components_with_original_config_local_files_only(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionInpaintPipeline ckpt_path = ( @@ -90,12 +92,12 @@ class StableDiffusion21InpaintPipelineSingleFileSlowTests(unittest.TestCase, SDS def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_single_file.py b/tests/single_file/test_stable_diffusion_single_file.py index e46e87e18c18..71afda1b80bb 100644 --- a/tests/single_file/test_stable_diffusion_single_file.py +++ b/tests/single_file/test_stable_diffusion_single_file.py @@ -7,9 +7,11 @@ from diffusers import EulerDiscreteScheduler, StableDiffusionPipeline from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import ( @@ -23,7 +25,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionPipeline ckpt_path = ( @@ -37,12 +39,12 @@ class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFile def setUp(self): 
super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -95,12 +97,12 @@ class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFi def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_upscale_single_file.py b/tests/single_file/test_stable_diffusion_upscale_single_file.py index 3c26d001c2b0..f410bc92dfc5 100644 --- a/tests/single_file/test_stable_diffusion_upscale_single_file.py +++ b/tests/single_file/test_stable_diffusion_upscale_single_file.py @@ -8,10 +8,12 @@ ) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import SDSingleFileTesterMixin @@ -21,7 +23,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionUpscalePipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin): pipeline_class = StableDiffusionUpscalePipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/blob/main/x4-upscaler-ema.safetensors" @@ -31,12 +33,12 @@ class StableDiffusionUpscalePipelineSingleFileSlowTests(unittest.TestCase, SDSin def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def test_single_file_format_inference_is_same_as_pretrained(self): image = load_image( diff --git a/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py index ead77a1d6553..e9def9c0e1f4 100644 --- a/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_adapter_single_file.py @@ -11,10 +11,12 @@ from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import ( @@ -29,7 +31,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLAdapterPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" @@ -41,12 +43,12 @@ class StableDiffusionXLAdapterPipelineSingleFileSlowTests(unittest.TestCase, SDX def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - 
torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self): prompt = "toy" diff --git a/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py index 9491adf2dfa4..bd900d9d308a 100644 --- a/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_controlnet_single_file.py @@ -8,9 +8,10 @@ from diffusers.loaders.single_file_utils import _extract_repo_id_and_weights_name from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, torch_device, ) @@ -26,7 +27,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLControlNetPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" @@ -38,12 +39,12 @@ class StableDiffusionXLControlNetPipelineSingleFileSlowTests(unittest.TestCase, def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py index 71b57eb7c6c9..60f6c18395ae 100644 --- a/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_img2img_single_file.py @@ -9,10 +9,12 @@ ) from diffusers.utils import load_image from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import SDXLSingleFileTesterMixin @@ -22,7 +24,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLImg2ImgPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" @@ -34,12 +36,12 @@ class StableDiffusionXLImg2ImgPipelineSingleFileSlowTests(unittest.TestCase, SDX def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) @@ -63,7 +65,7 @@ def test_single_file_format_inference_is_same_as_pretrained(self): @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLImg2ImgRefinerPipelineSingleFileSlowTests(unittest.TestCase): pipeline_class = StableDiffusionXLImg2ImgPipeline ckpt_path = ( diff --git a/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py index 7ebddc8555bb..5a014638633b 100644 --- 
a/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py +++ b/tests/single_file/test_stable_diffusion_xl_instruct_pix2pix.py @@ -5,9 +5,11 @@ from diffusers import StableDiffusionXLInstructPix2PixPipeline from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) @@ -15,7 +17,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLInstructPix2PixPipeline(unittest.TestCase): pipeline_class = StableDiffusionXLInstructPix2PixPipeline ckpt_path = "https://huggingface.co/stabilityai/cosxl/blob/main/cosxl_edit.safetensors" @@ -25,12 +27,12 @@ class StableDiffusionXLInstructPix2PixPipeline(unittest.TestCase): def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) diff --git a/tests/single_file/test_stable_diffusion_xl_single_file.py b/tests/single_file/test_stable_diffusion_xl_single_file.py index a143a35a2bbc..77f58d859209 100644 --- a/tests/single_file/test_stable_diffusion_xl_single_file.py +++ b/tests/single_file/test_stable_diffusion_xl_single_file.py @@ -7,9 +7,11 @@ StableDiffusionXLPipeline, ) from diffusers.utils.testing_utils import ( + backend_empty_cache, enable_full_determinism, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from .single_file_testing_utils import SDXLSingleFileTesterMixin @@ -19,7 +21,7 @@ @slow -@require_torch_gpu +@require_torch_accelerator class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingleFileTesterMixin): pipeline_class = StableDiffusionXLPipeline ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors" @@ -31,12 +33,12 @@ class StableDiffusionXLPipelineSingleFileSlowTests(unittest.TestCase, SDXLSingle def setUp(self): super().setUp() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def tearDown(self): super().tearDown() gc.collect() - torch.cuda.empty_cache() + backend_empty_cache(torch_device) def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): generator = torch.Generator(device=generator_device).manual_seed(seed) From c96bfa5c80eca798d555a79a491043c311d0f608 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 29 Nov 2024 14:15:00 +0530 Subject: [PATCH 03/38] [Mochi-1] ensuring to compute the fourier features in FP32 in Mochi encoder (#10031) compute fourier features in FP32. 
--- src/diffusers/models/autoencoders/autoencoder_kl_mochi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py index 0eabf3a26d7c..920b0b62fef6 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py @@ -437,7 +437,8 @@ def __init__(self, start: int = 6, stop: int = 8, step: int = 1): def forward(self, inputs: torch.Tensor) -> torch.Tensor: r"""Forward method of the `FourierFeatures` class.""" - + original_dtype = inputs.dtype + inputs = inputs.to(torch.float32) num_channels = inputs.shape[1] num_freqs = (self.stop - self.start) // self.step @@ -450,7 +451,7 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor: # Scale channels by frequency. h = w * h - return torch.cat([inputs, torch.sin(h), torch.cos(h)], dim=1) + return torch.cat([inputs, torch.sin(h), torch.cos(h)], dim=1).to(original_dtype) class MochiEncoder3D(nn.Module): From 784b351f32fad39ad8c8bb238faaa44090e00a08 Mon Sep 17 00:00:00 2001 From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:28:00 +0530 Subject: [PATCH 04/38] [Fix] Syntax error (#10068) fix syntax error --- scripts/convert_cogview3_to_diffusers.py | 2 +- scripts/convert_flux_to_diffusers.py | 2 +- scripts/convert_mochi_to_diffusers.py | 2 +- scripts/convert_sd3_to_diffusers.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/convert_cogview3_to_diffusers.py b/scripts/convert_cogview3_to_diffusers.py index 48cda2084240..605555ebdbef 100644 --- a/scripts/convert_cogview3_to_diffusers.py +++ b/scripts/convert_cogview3_to_diffusers.py @@ -36,7 +36,7 @@ from diffusers.utils.import_utils import is_accelerate_available -CTX = init_empty_weights if is_accelerate_available else nullcontext +CTX = init_empty_weights if is_accelerate_available() else nullcontext TOKENIZER_MAX_LENGTH = 224 diff --git a/scripts/convert_flux_to_diffusers.py b/scripts/convert_flux_to_diffusers.py index 33668fed8120..fccac70dd855 100644 --- a/scripts/convert_flux_to_diffusers.py +++ b/scripts/convert_flux_to_diffusers.py @@ -31,7 +31,7 @@ --vae """ -CTX = init_empty_weights if is_accelerate_available else nullcontext +CTX = init_empty_weights if is_accelerate_available() else nullcontext parser = argparse.ArgumentParser() parser.add_argument("--original_state_dict_repo_id", default=None, type=str) diff --git a/scripts/convert_mochi_to_diffusers.py b/scripts/convert_mochi_to_diffusers.py index 892fd871c554..9727deeb6b0c 100644 --- a/scripts/convert_mochi_to_diffusers.py +++ b/scripts/convert_mochi_to_diffusers.py @@ -10,7 +10,7 @@ from diffusers.utils.import_utils import is_accelerate_available -CTX = init_empty_weights if is_accelerate_available else nullcontext +CTX = init_empty_weights if is_accelerate_available() else nullcontext TOKENIZER_MAX_LENGTH = 256 diff --git a/scripts/convert_sd3_to_diffusers.py b/scripts/convert_sd3_to_diffusers.py index 1f9c434b39d0..0a3569efeab0 100644 --- a/scripts/convert_sd3_to_diffusers.py +++ b/scripts/convert_sd3_to_diffusers.py @@ -11,7 +11,7 @@ from diffusers.utils.import_utils import is_accelerate_available -CTX = init_empty_weights if is_accelerate_available else nullcontext +CTX = init_empty_weights if is_accelerate_available() else nullcontext parser = argparse.ArgumentParser() parser.add_argument("--checkpoint_path", type=str) From 
827b6c25f9b78a297345f356a7d152fd6faf27d8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 2 Dec 2024 14:53:43 +0530 Subject: [PATCH 05/38] [CI] Add quantization (#9832) * add quantization to nightly CI. * prep. * fix lib name. * remove deps that are not needed. * fix slice. --- .github/workflows/nightly_tests.yml | 58 +++++++++++++++++++++++ tests/quantization/bnb/test_4bit.py | 1 - tests/quantization/bnb/test_mixed_int8.py | 2 +- 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index b8e9860aec63..e2228fdacf30 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -347,6 +347,64 @@ jobs: pip install slack_sdk tabulate python utils/log_reports.py >> $GITHUB_STEP_SUMMARY + run_nightly_quantization_tests: + name: Torch quantization nightly tests + strategy: + fail-fast: false + max-parallel: 2 + matrix: + config: + - backend: "bitsandbytes" + test_location: "bnb" + runs-on: + group: aws-g6e-xlarge-plus + container: + image: diffusers/diffusers-pytorch-cuda + options: --shm-size "20gb" --ipc host --gpus 0 + steps: + - name: Checkout diffusers + uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: NVIDIA-SMI + run: nvidia-smi + - name: Install dependencies + run: | + python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" + python -m uv pip install -e [quality,test] + python -m uv pip install -U ${{ matrix.config.backend }} + python -m uv pip install pytest-reportlog + - name: Environment + run: | + python utils/print_env.py + - name: ${{ matrix.config.backend }} quantization tests on GPU + env: + HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }} + # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms + CUBLAS_WORKSPACE_CONFIG: :16:8 + BIG_GPU_MEMORY: 40 + run: | + python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ + --make-reports=tests_${{ matrix.config.backend }}_torch_cuda \ + --report-log=tests_${{ matrix.config.backend }}_torch_cuda.log \ + tests/quantization/${{ matrix.config.test_location }} + - name: Failure short reports + if: ${{ failure() }} + run: | + cat reports/tests_${{ matrix.config.backend }}_torch_cuda_stats.txt + cat reports/tests_${{ matrix.config.backend }}_torch_cuda_failures_short.txt + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: torch_cuda_${{ matrix.config.backend }}_reports + path: reports + - name: Generate Report and Notify Channel + if: always() + run: | + pip install slack_sdk tabulate + python utils/log_reports.py >> $GITHUB_STEP_SUMMARY + # M1 runner currently not well supported # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon # run_nightly_tests_apple_m1: diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 7b553434fbe9..b548b03be31d 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -432,7 +432,6 @@ def test_quality(self): expected_slice = np.array([0.1123, 0.1296, 0.1609, 0.1042, 0.1230, 0.1274, 0.0928, 0.1165, 0.1216]) max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) - print(f"{max_diff=}") self.assertTrue(max_diff < 1e-2) def test_generate_quality_dequantize(self): diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index ba2402461c87..a67e8d38e961 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ 
b/tests/quantization/bnb/test_mixed_int8.py @@ -369,7 +369,7 @@ def test_quality(self): output_type="np", ).images out_slice = output[0, -3:, -3:, -1].flatten() - expected_slice = np.array([0.0149, 0.0322, 0.0073, 0.0134, 0.0332, 0.011, 0.002, 0.0232, 0.0193]) + expected_slice = np.array([0.0376, 0.0359, 0.0015, 0.0449, 0.0479, 0.0098, 0.0083, 0.0295, 0.0295]) max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) self.assertTrue(max_diff < 1e-2) From 8d386f7990194172e40f6da651e00f92312cd35e Mon Sep 17 00:00:00 2001 From: hlky Date: Mon, 2 Dec 2024 18:16:47 +0000 Subject: [PATCH 06/38] Add `sigmas` to Flux pipelines (#10081) --- src/diffusers/pipelines/flux/pipeline_flux.py | 15 +++++++-------- .../pipelines/flux/pipeline_flux_control.py | 15 +++++++-------- .../flux/pipeline_flux_control_img2img.py | 15 +++++++-------- .../pipelines/flux/pipeline_flux_controlnet.py | 15 +++++++-------- .../pipeline_flux_controlnet_image_to_image.py | 13 +++++++------ .../flux/pipeline_flux_controlnet_inpainting.py | 13 +++++++------ .../pipelines/flux/pipeline_flux_fill.py | 15 +++++++-------- .../pipelines/flux/pipeline_flux_img2img.py | 15 +++++++-------- .../pipelines/flux/pipeline_flux_inpaint.py | 15 +++++++-------- 9 files changed, 63 insertions(+), 68 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux.py b/src/diffusers/pipelines/flux/pipeline_flux.py index e0add1e60ce2..ec2801625552 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux.py @@ -554,7 +554,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 3.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -585,10 +585,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -699,7 +699,7 @@ def __call__( ) # 5. 
Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = latents.shape[1] mu = calculate_shift( image_seq_len, @@ -712,8 +712,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_control.py b/src/diffusers/pipelines/flux/pipeline_flux_control.py index 04a93ba6351c..dc3ca8cf7b09 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_control.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_control.py @@ -621,7 +621,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 3.5, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -660,10 +660,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -799,7 +799,7 @@ def __call__( ) # 5. Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = latents.shape[1] mu = calculate_shift( image_seq_len, @@ -812,8 +812,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_control_img2img.py b/src/diffusers/pipelines/flux/pipeline_flux_control_img2img.py index ef20ab98ee2e..7001b19569f2 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_control_img2img.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_control_img2img.py @@ -647,7 +647,7 @@ def __call__( width: Optional[int] = None, strength: float = 0.6, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -698,10 +698,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
- timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -805,7 +805,7 @@ def __call__( ) # 4.Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2) mu = calculate_shift( image_seq_len, @@ -818,8 +818,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py index ce7ea35c6cea..4c2d2a0a3db9 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet.py @@ -602,7 +602,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, @@ -638,10 +638,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -872,7 +872,7 @@ def __call__( ) # 5. 
Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = latents.shape[1] mu = calculate_shift( image_seq_len, @@ -885,8 +885,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py index 6ab34d8a9c08..4c82d73f0379 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py @@ -646,7 +646,7 @@ def __call__( width: Optional[int] = None, strength: float = 0.6, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, @@ -685,8 +685,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 28): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). control_mode (`int` or `List[int]`, *optional*): @@ -858,7 +860,7 @@ def __call__( control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long) control_mode = control_mode.reshape([-1, 1]) - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2) mu = calculate_shift( image_seq_len, @@ -871,8 +873,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py index d81cffaca35b..c557cf134b05 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_controlnet_inpainting.py @@ -752,7 +752,7 @@ def __call__( width: Optional[int] = None, strength: float = 0.6, padding_mask_crop: Optional[int] = None, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, num_inference_steps: int = 28, guidance_scale: float = 7.0, control_guidance_start: Union[float, List[float]] = 0.0, @@ -799,8 +799,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 28): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process. 
+ sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): @@ -1009,7 +1011,7 @@ def __call__( # 6. Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = (int(global_height) // self.vae_scale_factor // 2) * ( int(global_width) // self.vae_scale_factor // 2 ) @@ -1024,8 +1026,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_fill.py b/src/diffusers/pipelines/flux/pipeline_flux_fill.py index 32b2bbefa709..723478ce724d 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_fill.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_fill.py @@ -689,7 +689,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 30.0, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -735,10 +735,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -878,7 +878,7 @@ def __call__( masked_image_latents = torch.cat((masked_image_latents, mask), dim=-1) # 6. 
Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = latents.shape[1] mu = calculate_shift( image_seq_len, @@ -891,8 +891,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_img2img.py b/src/diffusers/pipelines/flux/pipeline_flux_img2img.py index d34d9b53aa6b..2b336fbdd472 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_img2img.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_img2img.py @@ -593,7 +593,7 @@ def __call__( width: Optional[int] = None, strength: float = 0.6, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -636,10 +636,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -742,7 +742,7 @@ def __call__( ) # 4.Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2) mu = calculate_shift( image_seq_len, @@ -755,8 +755,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py b/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py index 3fcf6ace8a79..15abdb90ebd0 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_inpaint.py @@ -693,7 +693,7 @@ def __call__( padding_mask_crop: Optional[int] = None, strength: float = 0.6, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, num_images_per_prompt: Optional[int] = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -753,10 +753,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
- timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -873,7 +873,7 @@ def __call__( ) # 4.Prepare timesteps - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2) mu = calculate_shift( image_seq_len, @@ -886,8 +886,7 @@ def __call__( self.scheduler, num_inference_steps, device, - timesteps, - sigmas, + sigmas=sigmas, mu=mu, ) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) From 922c5f5c3c2e887ac9832a9e460619005e0af8ae Mon Sep 17 00:00:00 2001 From: Parag Ekbote Date: Tue, 3 Dec 2024 00:20:00 +0530 Subject: [PATCH 07/38] Fixed Nits in Evaluation Docs (#10063) Minor fixes and script improvement in evaluation docs. --- docs/source/en/conceptual/evaluation.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/source/en/conceptual/evaluation.md b/docs/source/en/conceptual/evaluation.md index 8dfbc8f2ac80..90e072bbf2ba 100644 --- a/docs/source/en/conceptual/evaluation.md +++ b/docs/source/en/conceptual/evaluation.md @@ -181,7 +181,7 @@ Then we load the [v1-5 checkpoint](https://huggingface.co/stable-diffusion-v1-5/ ```python model_ckpt_1_5 = "stable-diffusion-v1-5/stable-diffusion-v1-5" -sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=weight_dtype).to(device) +sd_pipeline_1_5 = StableDiffusionPipeline.from_pretrained(model_ckpt_1_5, torch_dtype=torch.float16).to("cuda") images_1_5 = sd_pipeline_1_5(prompts, num_images_per_prompt=1, generator=generator, output_type="np").images ``` @@ -280,7 +280,7 @@ from diffusers import StableDiffusionInstructPix2PixPipeline instruct_pix2pix_pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( "timbrooks/instruct-pix2pix", torch_dtype=torch.float16 -).to(device) +).to("cuda") ``` Now, we perform the edits: @@ -326,9 +326,9 @@ from transformers import ( clip_id = "openai/clip-vit-large-patch14" tokenizer = CLIPTokenizer.from_pretrained(clip_id) -text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to(device) +text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id).to("cuda") image_processor = CLIPImageProcessor.from_pretrained(clip_id) -image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to(device) +image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id).to("cuda") ``` Notice that we are using a particular CLIP checkpoint, i.e., `openai/clip-vit-large-patch14`. This is because the Stable Diffusion pre-training was performed with this CLIP variant. 
For more details, refer to the [documentation](https://huggingface.co/docs/transformers/model_doc/clip). @@ -350,7 +350,7 @@ class DirectionalSimilarity(nn.Module): def preprocess_image(self, image): image = self.image_processor(image, return_tensors="pt")["pixel_values"] - return {"pixel_values": image.to(device)} + return {"pixel_values": image.to("cuda")} def tokenize_text(self, text): inputs = self.tokenizer( @@ -360,7 +360,7 @@ class DirectionalSimilarity(nn.Module): truncation=True, return_tensors="pt", ) - return {"input_ids": inputs.input_ids.to(device)} + return {"input_ids": inputs.input_ids.to("cuda")} def encode_image(self, image): preprocessed_image = self.preprocess_image(image) @@ -459,6 +459,7 @@ with ZipFile(local_filepath, "r") as zipper: ```python from PIL import Image import os +import numpy as np dataset_path = "sample-imagenet-images" image_paths = sorted([os.path.join(dataset_path, x) for x in os.listdir(dataset_path)]) @@ -477,6 +478,7 @@ Now that the images are loaded, let's apply some lightweight pre-processing on t ```python from torchvision.transforms import functional as F +import torch def preprocess_image(image): @@ -498,6 +500,10 @@ dit_pipeline = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype= dit_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(dit_pipeline.scheduler.config) dit_pipeline = dit_pipeline.to("cuda") +seed = 0 +generator = torch.manual_seed(seed) + + words = [ "cassette player", "chainsaw", From c44fba889965638f447d20f5730745c7963494d7 Mon Sep 17 00:00:00 2001 From: ChG Date: Mon, 2 Dec 2024 11:45:12 -0800 Subject: [PATCH 08/38] fix link in the docs (#10058) * fix link in the docs * fix same issue for ko --- docs/source/en/training/create_dataset.md | 4 ++-- .../ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md | 4 ++-- docs/source/ko/training/create_dataset.md | 2 +- docs/source/ko/training/lora.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/en/training/create_dataset.md b/docs/source/en/training/create_dataset.md index 38783eff76bd..f3221beb408f 100644 --- a/docs/source/en/training/create_dataset.md +++ b/docs/source/en/training/create_dataset.md @@ -1,6 +1,6 @@ # Create a dataset for training -There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](hf.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation. +There are many datasets on the [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) to train a model on, but if you can't find one you're interested in or want to use your own, you can create a dataset with the 🤗 [Datasets](https://huggingface.co/docs/datasets) library. The dataset structure depends on the task you want to train your model on. The most basic dataset structure is a directory of images for tasks like unconditional image generation. Another dataset structure may be a directory of images and a text file containing their corresponding text captions for tasks like text-to-image generation. 
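As a hedged illustration of the two layouts described above, a plain folder of images, optionally with a `metadata.jsonl` that maps each `file_name` to a caption, can be loaded with the 🤗 Datasets ImageFolder builder; the directory name below is a placeholder:

```python
from datasets import load_dataset

# my_images/
# ├── metadata.jsonl   # optional: {"file_name": "0001.png", "text": "a caption"}
# ├── 0001.png
# └── 0002.png
dataset = load_dataset("imagefolder", data_dir="my_images", split="train")
print(dataset[0])  # {"image": <PIL.Image>, "text": "a caption"} when captions are present
```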
This guide will show you two ways to create a dataset to finetune on: @@ -87,4 +87,4 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \ Now that you've created a dataset, you can plug it into the `train_data_dir` (if your dataset is local) or `dataset_name` (if your dataset is on the Hub) arguments of a training script. -For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)! \ No newline at end of file +For your next steps, feel free to try and use your dataset to train a model for [unconditional generation](unconditional_training) or [text-to-image generation](text2image)! diff --git a/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md b/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md index d7211d6b9471..d708dfa59dad 100644 --- a/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md +++ b/docs/source/ko/api/pipelines/stable_diffusion/stable_diffusion_xl.md @@ -121,7 +121,7 @@ image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inferen ### 이미지 결과물을 정제하기 -[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다. +[base 모델 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)에서, StableDiffusion-XL 또한 고주파 품질을 향상시키는 이미지를 생성하기 위해 낮은 노이즈 단계 이미지를 제거하는데 특화된 [refiner 체크포인트](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 포함하고 있습니다. 이 refiner 체크포인트는 이미지 품질을 향상시키기 위해 base 체크포인트를 실행한 후 "두 번째 단계" 파이프라인에 사용될 수 있습니다. refiner를 사용할 때, 쉽게 사용할 수 있습니다 - 1.) base 모델과 refiner을 사용하는데, 이는 *Denoisers의 앙상블*을 위한 첫 번째 제안된 [eDiff-I](https://research.nvidia.com/labs/dir/eDiff-I/)를 사용하거나 @@ -215,7 +215,7 @@ image = refiner( #### 2.) 노이즈가 완전히 제거된 기본 이미지에서 이미지 출력을 정제하기 -일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다. +일반적인 [`StableDiffusionImg2ImgPipeline`] 방식에서, 기본 모델에서 생성된 완전히 노이즈가 제거된 이미지는 [refiner checkpoint](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0)를 사용해 더 향상시킬 수 있습니다. 이를 위해, 보통의 "base" text-to-image 파이프라인을 수행 후에 image-to-image 파이프라인으로써 refiner를 실행시킬 수 있습니다. base 모델의 출력을 잠재 공간에 남겨둘 수 있습니다. diff --git a/docs/source/ko/training/create_dataset.md b/docs/source/ko/training/create_dataset.md index 6987a6c9d4f0..401a73ebf237 100644 --- a/docs/source/ko/training/create_dataset.md +++ b/docs/source/ko/training/create_dataset.md @@ -1,7 +1,7 @@ # 학습을 위한 데이터셋 만들기 [Hub](https://huggingface.co/datasets?task_categories=task_categories:text-to-image&sort=downloads) 에는 모델 교육을 위한 많은 데이터셋이 있지만, -관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](hf.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다. +관심이 있거나 사용하고 싶은 데이터셋을 찾을 수 없는 경우 🤗 [Datasets](https://huggingface.co/docs/datasets) 라이브러리를 사용하여 데이터셋을 만들 수 있습니다. 데이터셋 구조는 모델을 학습하려는 작업에 따라 달라집니다. 가장 기본적인 데이터셋 구조는 unconditional 이미지 생성과 같은 작업을 위한 이미지 디렉토리입니다. 또 다른 데이터셋 구조는 이미지 디렉토리와 text-to-image 생성과 같은 작업에 해당하는 텍스트 캡션이 포함된 텍스트 파일일 수 있습니다. 
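The English guide above notes that a finished dataset is handed to a training script either locally via `train_data_dir` or from the Hub via `dataset_name`. A short sketch of the Hub route, assuming you are already logged in and using a hypothetical repo id:

```python
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir="my_images", split="train")
# Hypothetical repo id; requires `huggingface-cli login` or a token beforehand.
dataset.push_to_hub("my-username/my-image-dataset")
```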
diff --git a/docs/source/ko/training/lora.md b/docs/source/ko/training/lora.md index 6b905951aafc..85ed1dda0b81 100644 --- a/docs/source/ko/training/lora.md +++ b/docs/source/ko/training/lora.md @@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License. [cloneofsimo](https://github.com/cloneofsimo)는 인기 있는 [lora](https://github.com/cloneofsimo/lora) GitHub 리포지토리에서 Stable Diffusion을 위한 LoRA 학습을 최초로 시도했습니다. 🧨 Diffusers는 [text-to-image 생성](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) 및 [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora)을 지원합니다. 이 가이드는 두 가지를 모두 수행하는 방법을 보여줍니다. -모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](hf.co/join)하세요): +모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](https://huggingface.co/join)하세요): ```bash huggingface-cli login From cd344393e20f321ccb569fb893b227caf7d28235 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 2 Dec 2024 10:11:25 -1000 Subject: [PATCH 09/38] fix offloading for sd3.5 controlnets (#10072) * add --- .../models/controlnets/controlnet_sd3.py | 14 +++++++++++++ .../pipeline_stable_diffusion_3_controlnet.py | 21 ++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py index 2a5fcf35498e..4f3253d82f3d 100644 --- a/src/diffusers/models/controlnets/controlnet_sd3.py +++ b/src/diffusers/models/controlnets/controlnet_sd3.py @@ -266,6 +266,20 @@ def _set_gradient_checkpointing(self, module, value=False): if hasattr(module, "gradient_checkpointing"): module.gradient_checkpointing = value + # Notes: This is for SD3.5 8b controlnet, which shares the pos_embed with the transformer + # we should have handled this in conversion script + def _get_pos_embed_from_transformer(self, transformer): + pos_embed = PatchEmbed( + height=transformer.config.sample_size, + width=transformer.config.sample_size, + patch_size=transformer.config.patch_size, + in_channels=transformer.config.in_channels, + embed_dim=transformer.inner_dim, + pos_embed_max_size=transformer.config.pos_embed_max_size, + ) + pos_embed.load_state_dict(transformer.pos_embed.state_dict(), strict=True) + return pos_embed + @classmethod def from_transformer( cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py index b92dafffc715..8fd07fafc766 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py @@ -194,6 +194,19 @@ def __init__( super().__init__() if isinstance(controlnet, (list, tuple)): controlnet = SD3MultiControlNetModel(controlnet) + if isinstance(controlnet, SD3MultiControlNetModel): + for controlnet_model in controlnet.nets: + # for SD3.5 8b controlnet, it shares the pos_embed with the transformer + if ( + hasattr(controlnet_model.config, "use_pos_embed") + and controlnet_model.config.use_pos_embed is False + ): + pos_embed = controlnet_model._get_pos_embed_from_transformer(transformer) + controlnet_model.pos_embed = pos_embed.to(controlnet_model.dtype).to(controlnet_model.device) + elif isinstance(controlnet, 
SD3ControlNetModel): + if hasattr(controlnet.config, "use_pos_embed") and controlnet.config.use_pos_embed is False: + pos_embed = controlnet._get_pos_embed_from_transformer(transformer) + controlnet.pos_embed = pos_embed.to(controlnet.dtype).to(controlnet.device) self.register_modules( vae=vae, @@ -1042,15 +1055,9 @@ def __call__( controlnet_cond_scale = controlnet_cond_scale[0] cond_scale = controlnet_cond_scale * controlnet_keep[i] - if controlnet_config.use_pos_embed is False: - # sd35 (offical) 8b controlnet - controlnet_model_input = self.transformer.pos_embed(latent_model_input) - else: - controlnet_model_input = latent_model_input - # controlnet(s) inference control_block_samples = self.controlnet( - hidden_states=controlnet_model_input, + hidden_states=latent_model_input, timestep=timestep, encoder_hidden_states=controlnet_encoder_hidden_states, pooled_projections=controlnet_pooled_projections, From a9d3f6c359caad00ae0c93d162d8b9a525776e0e Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 3 Dec 2024 02:46:16 +0530 Subject: [PATCH 10/38] [Single File] Fix SD3.5 single file loading (#10077) update --- src/diffusers/loaders/single_file_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 9a460cb5d1ef..10742873ded1 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -127,6 +127,9 @@ "sd35_large": { "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-large", }, + "sd35_medium": { + "pretrained_model_name_or_path": "stabilityai/stable-diffusion-3.5-medium", + }, "animatediff_v1": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5"}, "animatediff_v2": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-2"}, "animatediff_v3": {"pretrained_model_name_or_path": "guoyww/animatediff-motion-adapter-v1-5-3"}, @@ -527,7 +530,10 @@ def infer_diffusers_model_type(checkpoint): model_type = "stable_cascade_stage_b" elif CHECKPOINT_KEY_NAMES["sd3"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["sd3"]].shape[-1] == 9216: - model_type = "sd3" + if checkpoint["model.diffusion_model.pos_embed"].shape[1] == 36864: + model_type = "sd3" + elif checkpoint["model.diffusion_model.pos_embed"].shape[1] == 147456: + model_type = "sd35_medium" elif CHECKPOINT_KEY_NAMES["sd35_large"] in checkpoint: model_type = "sd35_large" From beb856685ddb2000680115544da1babfd41a9d22 Mon Sep 17 00:00:00 2001 From: hlky Date: Mon, 2 Dec 2024 21:43:03 +0000 Subject: [PATCH 11/38] Fix `num_images_per_prompt>1` with Skip Guidance Layers in `StableDiffusion3Pipeline` (#10086) --- .../stable_diffusion_3/pipeline_stable_diffusion_3.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index a77231cdc02d..aee1ad8c75f5 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -907,11 +907,7 @@ def __call__( continue # expand the latents if we are doing classifier free guidance - latent_model_input = ( - torch.cat([latents] * 2) - if self.do_classifier_free_guidance and skip_guidance_layers is None - else latents - ) + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents # 
broadcast to batch dimension in a way that's compatible with ONNX/Core ML timestep = t.expand(latent_model_input.shape[0]) @@ -935,6 +931,8 @@ def __call__( else False ) if skip_guidance_layers is not None and should_skip_layers: + timestep = t.expand(latents.shape[0]) + latent_model_input = latents noise_pred_skip_layers = self.transformer( hidden_states=latent_model_input, timestep=timestep, From 6db33337a49c2f20db1b8d2ad069cca10c552c68 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 3 Dec 2024 03:25:36 +0530 Subject: [PATCH 12/38] [Single File] Pass token when fetching interpreted config (#10082) update --- src/diffusers/loaders/single_file_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 3fe1abfbead5..be3139057078 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -269,6 +269,7 @@ def from_single_file(cls, pretrained_model_link_or_path_or_dict: Optional[str] = pretrained_model_name_or_path=default_pretrained_model_config_name, subfolder=subfolder, local_files_only=local_files_only, + token=token, ) expected_kwargs, optional_kwargs = cls._get_signature_keys(cls) From 2312b27f796874658bc7391dd5d5c58b71dde153 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Tue, 3 Dec 2024 00:33:56 +0100 Subject: [PATCH 13/38] Interpolate fix on cuda for large output tensors (#10067) * Workaround for upscale with large output tensors. Fixes #10040. * Fix scale when output_size is given * Style --------- Co-authored-by: Sayak Paul --- src/diffusers/models/upsampling.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py index cf07e45b0c5c..af04ae4b93cf 100644 --- a/src/diffusers/models/upsampling.py +++ b/src/diffusers/models/upsampling.py @@ -165,6 +165,14 @@ def forward(self, hidden_states: torch.Tensor, output_size: Optional[int] = None # if `output_size` is passed we force the interpolation output # size and do not make use of `scale_factor=2` if self.interpolate: + # upsample_nearest_nhwc also fails when the number of output elements is large + # https://github.com/pytorch/pytorch/issues/141831 + scale_factor = ( + 2 if output_size is None else max([f / s for f, s in zip(output_size, hidden_states.shape[-2:])]) + ) + if hidden_states.numel() * scale_factor > pow(2, 31): + hidden_states = hidden_states.contiguous() + if output_size is None: hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") else: From 30f2e9bd202c89bb3862c8ada470d0d1ac8ee0e5 Mon Sep 17 00:00:00 2001 From: hlky Date: Tue, 3 Dec 2024 00:18:40 +0000 Subject: [PATCH 14/38] Convert `sigmas` to `np.array` in FlowMatch set_timesteps (#10088) --- src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py index d01071ec27b8..91264e805a0f 100644 --- a/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_flow_match_euler_discrete.py @@ -207,6 +207,7 @@ def set_timesteps( sigmas = timesteps / self.config.num_train_timesteps else: + sigmas = np.array(sigmas).astype(np.float32) num_inference_steps = len(sigmas) self.num_inference_steps = num_inference_steps From 963ffca43419a8dffa682d9e03c2299b76feeced Mon Sep 17 00:00:00 2001 From: Emmanuel 
Benazera Date: Tue, 3 Dec 2024 04:10:20 +0100 Subject: [PATCH 15/38] fix: missing AutoencoderKL lora adapter (#9807) * fix: missing AutoencoderKL lora adapter * fix --------- Co-authored-by: Sayak Paul --- .../models/autoencoders/autoencoder_kl.py | 3 +- tests/models/autoencoders/test_models_vae.py | 38 +++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py index 99a7da4a0b6f..9036c027a535 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -17,6 +17,7 @@ import torch.nn as nn from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin from ...loaders.single_file_model import FromOriginalModelMixin from ...utils import deprecate from ...utils.accelerate_utils import apply_forward_hook @@ -34,7 +35,7 @@ from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder -class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin): +class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): r""" A VAE model with KL loss for encoding images into latents and decoding latent representations into images. diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py index d29defbf6085..d475160cc796 100644 --- a/tests/models/autoencoders/test_models_vae.py +++ b/tests/models/autoencoders/test_models_vae.py @@ -36,7 +36,9 @@ backend_empty_cache, enable_full_determinism, floats_tensor, + is_peft_available, load_hf_numpy, + require_peft_backend, require_torch_accelerator, require_torch_accelerator_with_fp16, require_torch_gpu, @@ -50,6 +52,10 @@ from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin +if is_peft_available(): + from peft import LoraConfig + + enable_full_determinism() @@ -263,6 +269,38 @@ def test_output_pretrained(self): self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) + @require_peft_backend + def test_lora_adapter(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + vae = self.model_class(**init_dict) + + target_modules_vae = [ + "conv1", + "conv2", + "conv_in", + "conv_shortcut", + "conv", + "conv_out", + "skip_conv_1", + "skip_conv_2", + "skip_conv_3", + "skip_conv_4", + "to_k", + "to_q", + "to_v", + "to_out.0", + ] + vae_lora_config = LoraConfig( + r=16, + init_lora_weights="gaussian", + target_modules=target_modules_vae, + ) + + vae.add_adapter(vae_lora_config, adapter_name="vae_lora") + active_lora = vae.active_adapters() + self.assertTrue(len(active_lora) == 1) + self.assertTrue(active_lora[0] == "vae_lora") + class AsymmetricAutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = AsymmetricAutoencoderKL From 0763a7edf4e9f2992f5ec8fb0c9dca8ab3e29f07 Mon Sep 17 00:00:00 2001 From: Lucain Date: Tue, 3 Dec 2024 04:15:46 +0100 Subject: [PATCH 16/38] Let server decide default repo visibility (#10047) --- docs/source/en/tutorials/basic_training.md | 2 +- docs/source/ko/tutorials/basic_training.md | 2 +- src/diffusers/configuration_utils.py | 2 +- src/diffusers/models/modeling_flax_utils.py | 2 +- src/diffusers/models/modeling_utils.py | 2 +- src/diffusers/pipelines/pipeline_flax_utils.py | 2 +- src/diffusers/pipelines/pipeline_utils.py | 2 +- src/diffusers/utils/hub_utils.py | 3 ++- 8 files changed, 9 insertions(+), 8 deletions(-) diff 
--git a/docs/source/en/tutorials/basic_training.md b/docs/source/en/tutorials/basic_training.md index 402c8c59b17d..f8c4a5b84b9f 100644 --- a/docs/source/en/tutorials/basic_training.md +++ b/docs/source/en/tutorials/basic_training.md @@ -75,7 +75,7 @@ For convenience, create a `TrainingConfig` class containing the training hyperpa ... push_to_hub = True # whether to upload the saved model to the HF Hub ... hub_model_id = "/" # the name of the repository to create on the HF Hub -... hub_private_repo = False +... hub_private_repo = None ... overwrite_output_dir = True # overwrite the old model when re-running the notebook ... seed = 0 diff --git a/docs/source/ko/tutorials/basic_training.md b/docs/source/ko/tutorials/basic_training.md index f34507b50c9d..5b08bb39d602 100644 --- a/docs/source/ko/tutorials/basic_training.md +++ b/docs/source/ko/tutorials/basic_training.md @@ -76,7 +76,7 @@ huggingface-cli login ... output_dir = "ddpm-butterflies-128" # 로컬 및 HF Hub에 저장되는 모델명 ... push_to_hub = True # 저장된 모델을 HF Hub에 업로드할지 여부 -... hub_private_repo = False +... hub_private_repo = None ... overwrite_output_dir = True # 노트북을 다시 실행할 때 이전 모델에 덮어씌울지 ... seed = 0 diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 11d45dc64d97..d21ada6fbe60 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -170,7 +170,7 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool if push_to_hub: commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) + private = kwargs.pop("private", None) create_pr = kwargs.pop("create_pr", False) token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 8c35fab0fc16..1e61a56ec339 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -530,7 +530,7 @@ def save_pretrained( if push_to_hub: commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) + private = kwargs.pop("private", None) create_pr = kwargs.pop("create_pr", False) token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 4a486fd4ce40..76f6c5f6309d 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -338,7 +338,7 @@ def save_pretrained( if push_to_hub: commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) + private = kwargs.pop("private", None) create_pr = kwargs.pop("create_pr", False) token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index c4c212873a88..f7b101124181 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -180,7 +180,7 @@ class implements both a save and loading method. 
The pipeline is easily reloaded if push_to_hub: commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) + private = kwargs.pop("private", None) create_pr = kwargs.pop("create_pr", False) token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2e1858b16148..a4faacb44914 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -229,7 +229,7 @@ class implements both a save and loading method. The pipeline is easily reloaded if push_to_hub: commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", False) + private = kwargs.pop("private", None) create_pr = kwargs.pop("create_pr", False) token = kwargs.pop("token", None) repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index 448e92509732..ef4715ee0e1e 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -564,7 +564,8 @@ def push_to_hub( commit_message (`str`, *optional*): Message to commit while pushing. Default to `"Upload {object}"`. private (`bool`, *optional*): - Whether or not the repository created should be private. + Whether to make the repo private. If `None` (default), the repo will be public unless the + organization's default is private. This value is ignored if the repo already exists. token (`str`, *optional*): The token to use as HTTP bearer authorization for remote files. The token generated when running `huggingface-cli login` (stored in `~/.huggingface`). From fc72e0f2616ff993733eaa0310f0253646e0c525 Mon Sep 17 00:00:00 2001 From: DTG <68813178+DTG2005@users.noreply.github.com> Date: Tue, 3 Dec 2024 09:12:52 +0530 Subject: [PATCH 17/38] Fix some documentation in ./src/diffusers/models/embeddings.py for demo (#9579) * Fix some documentation in ./src/diffusers/models/embeddings.py as demonstration. --------- Co-authored-by: DaAccursed05 <68813178+DaAccursed05@users.noreply.github.com> Co-authored-by: Aryan Co-authored-by: Aryan Co-authored-by: YiYi Xu --- src/diffusers/models/embeddings.py | 110 +++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 80775d477c0d..91451fa9aac2 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -86,12 +86,25 @@ def get_3d_sincos_pos_embed( temporal_interpolation_scale: float = 1.0, ) -> np.ndarray: r""" + Creates 3D sinusoidal positional embeddings. + Args: embed_dim (`int`): + The embedding dimension of inputs. It must be divisible by 16. spatial_size (`int` or `Tuple[int, int]`): + The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both + spatial dimensions (height and width). temporal_size (`int`): + The temporal dimension of postional embeddings (number of frames). spatial_interpolation_scale (`float`, defaults to 1.0): + Scale factor for spatial grid interpolation. temporal_interpolation_scale (`float`, defaults to 1.0): + Scale factor for temporal grid interpolation. + + Returns: + `np.ndarray`: + The 3D sinusoidal positional embeddings of shape `[temporal_size, spatial_size[0] * spatial_size[1], + embed_dim]`. 
""" if embed_dim % 4 != 0: raise ValueError("`embed_dim` must be divisible by 4") @@ -129,8 +142,24 @@ def get_2d_sincos_pos_embed( embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 ): """ - grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or - [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + Creates 2D sinusoidal positional embeddings. + + Args: + embed_dim (`int`): + The embedding dimension. + grid_size (`int`): + The size of the grid height and width. + cls_token (`bool`, defaults to `False`): + Whether or not to add a classification token. + extra_tokens (`int`, defaults to `0`): + The number of extra tokens to add. + interpolation_scale (`float`, defaults to `1.0`): + The scale of the interpolation. + + Returns: + pos_embed (`np.ndarray`): + Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, + embed_dim]` if using cls_token """ if isinstance(grid_size, int): grid_size = (grid_size, grid_size) @@ -148,6 +177,16 @@ def get_2d_sincos_pos_embed( def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + r""" + This function generates 2D sinusoidal positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension. + grid (`np.ndarray`): Grid of positions with shape `(H * W,)`. + + Returns: + `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` + """ if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") @@ -161,7 +200,14 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ - embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D) + This function generates 1D positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension `D` + pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)` + + Returns: + `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`. """ if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") @@ -181,7 +227,22 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): class PatchEmbed(nn.Module): - """2D Image to Patch Embedding with support for SD3 cropping.""" + """ + 2D Image to Patch Embedding with support for SD3 cropping. + + Args: + height (`int`, defaults to `224`): The height of the image. + width (`int`, defaults to `224`): The width of the image. + patch_size (`int`, defaults to `16`): The size of the patches. + in_channels (`int`, defaults to `3`): The number of input channels. + embed_dim (`int`, defaults to `768`): The output dimension of the embedding. + layer_norm (`bool`, defaults to `False`): Whether or not to use layer normalization. + flatten (`bool`, defaults to `True`): Whether or not to flatten the output. + bias (`bool`, defaults to `True`): Whether or not to use bias. + interpolation_scale (`float`, defaults to `1`): The scale of the interpolation. + pos_embed_type (`str`, defaults to `"sincos"`): The type of positional embedding. + pos_embed_max_size (`int`, defaults to `None`): The maximum size of the positional embedding. + """ def __init__( self, @@ -289,7 +350,15 @@ def forward(self, latent): class LuminaPatchEmbed(nn.Module): - """2D Image to Patch Embedding with support for Lumina-T2X""" + """ + 2D Image to Patch Embedding with support for Lumina-T2X + + Args: + patch_size (`int`, defaults to `2`): The size of the patches. 
+ in_channels (`int`, defaults to `4`): The number of input channels. + embed_dim (`int`, defaults to `768`): The output dimension of the embedding. + bias (`bool`, defaults to `True`): Whether or not to use bias. + """ def __init__(self, patch_size=2, in_channels=4, embed_dim=768, bias=True): super().__init__() @@ -675,6 +744,20 @@ def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True): def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False): + """ + Get 2D RoPE from grid. + + Args: + embed_dim: (`int`): + The embedding dimension size, corresponding to hidden_size_head. + grid (`np.ndarray`): + The grid of the positional embedding. + use_real (`bool`): + If True, return real part and imaginary part separately. Otherwise, return complex numbers. + + Returns: + `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`. + """ assert embed_dim % 4 == 0 # use half of dimensions to encode grid_h @@ -695,6 +778,23 @@ def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False): def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, ntk_factor=1.0): + """ + Get 2D RoPE from grid. + + Args: + embed_dim: (`int`): + The embedding dimension size, corresponding to hidden_size_head. + grid (`np.ndarray`): + The grid of the positional embedding. + linear_factor (`float`): + The linear factor of the positional embedding, which is used to scale the positional embedding in the linear + layer. + ntk_factor (`float`): + The ntk factor of the positional embedding, which is used to scale the positional embedding in the ntk layer. + + Returns: + `torch.Tensor`: positional embedding with shape `( grid_size * grid_size, embed_dim/2)`. + """ assert embed_dim % 4 == 0 emb_h = get_1d_rotary_pos_embed( From acf79b3487b2df9c8782fe0275f06a7293610942 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Tue, 3 Dec 2024 08:30:01 +0100 Subject: [PATCH 18/38] Don't stale close-to-merge (#10096) Re: https://github.com/huggingface/diffusers/discussions/10046#discussioncomment-11443466 --- utils/stale.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/stale.py b/utils/stale.py index c01b6d5682e9..20cb6cabeb91 100644 --- a/utils/stale.py +++ b/utils/stale.py @@ -24,6 +24,7 @@ LABELS_TO_EXEMPT = [ + "close-to-merge", "good first issue", "good second issue", "good difficult issue", From 63b631f38336f56755fb5cf15d9b0fb70bbf6323 Mon Sep 17 00:00:00 2001 From: Benjamin Paine <57536852+painebenjamin@users.noreply.github.com> Date: Tue, 3 Dec 2024 02:39:47 -0500 Subject: [PATCH 19/38] Add StableDiffusion3PAGImg2Img Pipeline + Fix SD3 Unconditional PAG (#9932) * fix progress bar updates in SD 1.5 PAG Img2Img pipeline --------- Co-authored-by: Vinh H. 
Pham Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/pag.md | 4 + src/diffusers/__init__.py | 2 + src/diffusers/models/attention_processor.py | 1 + src/diffusers/pipelines/__init__.py | 2 + src/diffusers/pipelines/auto_pipeline.py | 2 + src/diffusers/pipelines/pag/__init__.py | 2 + .../pag/pipeline_pag_sd_3_img2img.py | 1041 +++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + tests/pipelines/pag/test_pag_sd3_img2img.py | 276 +++++ 9 files changed, 1345 insertions(+) create mode 100644 src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py create mode 100644 tests/pipelines/pag/test_pag_sd3_img2img.py diff --git a/docs/source/en/api/pipelines/pag.md b/docs/source/en/api/pipelines/pag.md index cc6d075f457f..e723761f6fe0 100644 --- a/docs/source/en/api/pipelines/pag.md +++ b/docs/source/en/api/pipelines/pag.md @@ -96,6 +96,10 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial - all - __call__ +## StableDiffusion3PAGImg2ImgPipeline +[[autodoc]] StableDiffusion3PAGImg2ImgPipeline + - all + - __call__ ## PixArtSigmaPAGPipeline [[autodoc]] PixArtSigmaPAGPipeline diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a4749af5f61b..6f70a8191629 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -339,6 +339,7 @@ "StableDiffusion3Img2ImgPipeline", "StableDiffusion3InpaintPipeline", "StableDiffusion3PAGPipeline", + "StableDiffusion3PAGImg2ImgPipeline", "StableDiffusion3Pipeline", "StableDiffusionAdapterPipeline", "StableDiffusionAttendAndExcitePipeline", @@ -807,6 +808,7 @@ StableDiffusion3ControlNetPipeline, StableDiffusion3Img2ImgPipeline, StableDiffusion3InpaintPipeline, + StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, StableDiffusion3Pipeline, StableDiffusionAdapterPipeline, diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index ffbf4a0056c6..7351801368dd 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1171,6 +1171,7 @@ def __call__( attn: Attention, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, ) -> torch.FloatTensor: residual = hidden_states diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 5143b1114fd3..6d3a20511696 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -171,6 +171,7 @@ "KolorsPAGPipeline", "HunyuanDiTPAGPipeline", "StableDiffusion3PAGPipeline", + "StableDiffusion3PAGImg2ImgPipeline", "StableDiffusionPAGPipeline", "StableDiffusionPAGImg2ImgPipeline", "StableDiffusionControlNetPAGPipeline", @@ -589,6 +590,7 @@ HunyuanDiTPAGPipeline, KolorsPAGPipeline, PixArtSigmaPAGPipeline, + StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, StableDiffusionControlNetPAGInpaintPipeline, StableDiffusionControlNetPAGPipeline, diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py index 0214d7dd6f3c..59ed10758a53 100644 --- a/src/diffusers/pipelines/auto_pipeline.py +++ b/src/diffusers/pipelines/auto_pipeline.py @@ -61,6 +61,7 @@ from .pag import ( HunyuanDiTPAGPipeline, PixArtSigmaPAGPipeline, + StableDiffusion3PAGImg2ImgPipeline, StableDiffusion3PAGPipeline, StableDiffusionControlNetPAGInpaintPipeline, StableDiffusionControlNetPAGPipeline, @@ -129,6 +130,7 @@ ("stable-diffusion", StableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", 
StableDiffusionXLImg2ImgPipeline), ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline), + ("stable-diffusion-3-pag", StableDiffusion3PAGImg2ImgPipeline), ("if", IFImg2ImgPipeline), ("kandinsky", KandinskyImg2ImgCombinedPipeline), ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline), diff --git a/src/diffusers/pipelines/pag/__init__.py b/src/diffusers/pipelines/pag/__init__.py index 6a6723b58ca9..dfd823b0db27 100644 --- a/src/diffusers/pipelines/pag/__init__.py +++ b/src/diffusers/pipelines/pag/__init__.py @@ -31,6 +31,7 @@ _import_structure["pipeline_pag_pixart_sigma"] = ["PixArtSigmaPAGPipeline"] _import_structure["pipeline_pag_sd"] = ["StableDiffusionPAGPipeline"] _import_structure["pipeline_pag_sd_3"] = ["StableDiffusion3PAGPipeline"] + _import_structure["pipeline_pag_sd_3_img2img"] = ["StableDiffusion3PAGImg2ImgPipeline"] _import_structure["pipeline_pag_sd_animatediff"] = ["AnimateDiffPAGPipeline"] _import_structure["pipeline_pag_sd_img2img"] = ["StableDiffusionPAGImg2ImgPipeline"] _import_structure["pipeline_pag_sd_xl"] = ["StableDiffusionXLPAGPipeline"] @@ -54,6 +55,7 @@ from .pipeline_pag_pixart_sigma import PixArtSigmaPAGPipeline from .pipeline_pag_sd import StableDiffusionPAGPipeline from .pipeline_pag_sd_3 import StableDiffusion3PAGPipeline + from .pipeline_pag_sd_3_img2img import StableDiffusion3PAGImg2ImgPipeline from .pipeline_pag_sd_animatediff import AnimateDiffPAGPipeline from .pipeline_pag_sd_img2img import StableDiffusionPAGImg2ImgPipeline from .pipeline_pag_sd_xl import StableDiffusionXLPAGPipeline diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py new file mode 100644 index 000000000000..54e37e0fd286 --- /dev/null +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py @@ -0,0 +1,1041 @@ +# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
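Because the patch also registers `StableDiffusion3PAGImg2ImgPipeline` in the image-to-image auto-pipeline mapping above, PAG should be reachable through `AutoPipelineForImage2Image` as well. A hedged sketch only; the checkpoint and layer name mirror the example docstring just below, `enable_pag` is the existing auto-pipeline switch, and the init image path is a placeholder:

```python
import torch
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image

pipe = AutoPipelineForImage2Image.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    enable_pag=True,
    pag_applied_layers=["blocks.13"],
    torch_dtype=torch.float16,
).to("cuda")

init_image = load_image("path/to/init.png")  # placeholder path
image = pipe(
    "a photo of an astronaut riding a horse on mars",
    image=init_image,
    guidance_scale=5.0,
    pag_scale=0.7,
).images[0]
```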
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import ( + CLIPTextModelWithProjection, + CLIPTokenizer, + T5EncoderModel, + T5TokenizerFast, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, SD3LoraLoaderMixin +from ...models.attention_processor import PAGCFGJointAttnProcessor2_0, PAGJointAttnProcessor2_0 +from ...models.autoencoders import AutoencoderKL +from ...models.transformers import SD3Transformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import ( + USE_PEFT_BACKEND, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from ..stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput +from .pag_utils import PAGMixin + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusion3PAGImg2ImgPipeline + >>> from diffusers.utils import load_image + + >>> pipe = StableDiffusion3PAGImg2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-3-medium-diffusers", + ... torch_dtype=torch.float16, + ... pag_applied_layers=["blocks.13"], + ... ) + >>> pipe.to("cuda") + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png" + >>> init_image = load_image(url).convert("RGB") + >>> image = pipe(prompt, image=init_image, guidance_scale=5.0, pag_scale=0.7).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusion3PAGImg2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, PAGMixin): + r""" + [PAG pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/pag) for image-to-image generation + using Stable Diffusion 3. + + Args: + transformer ([`SD3Transformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModelWithProjection`]): + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant, + with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size` + as its dimension. + text_encoder_2 ([`CLIPTextModelWithProjection`]): + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + text_encoder_3 ([`T5EncoderModel`]): + Frozen text-encoder. Stable Diffusion 3 uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the + [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant. 
+ tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_3 (`T5TokenizerFast`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"] + + def __init__( + self, + transformer: SD3Transformer2DModel, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer_2: CLIPTokenizer, + text_encoder_3: T5EncoderModel, + tokenizer_3: T5TokenizerFast, + pag_applied_layers: Union[str, List[str]] = "blocks.1", # 1st transformer block + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + text_encoder_3=text_encoder_3, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + tokenizer_3=tokenizer_3, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.tokenizer_max_length = ( + self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 + ) + self.default_sample_size = ( + self.transformer.config.sample_size + if hasattr(self, "transformer") and self.transformer is not None + else 128 + ) + self.patch_size = ( + self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2 + ) + + self.set_pag_applied_layers( + pag_applied_layers, pag_attn_processors=(PAGCFGJointAttnProcessor2_0(), PAGJointAttnProcessor2_0()) + ) + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_t5_prompt_embeds + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 256, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + if self.text_encoder_3 is None: + return torch.zeros( + ( + batch_size * num_images_per_prompt, + self.tokenizer_max_length, + self.transformer.config.joint_attention_dim, + ), + device=device, + dtype=dtype, + ) + + text_inputs = self.tokenizer_3( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) + logger.warning( + "The 
following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0] + + dtype = self.text_encoder_3.dtype + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + _, seq_len, _ = prompt_embeds.shape + + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline._get_clip_prompt_embeds + def _get_clip_prompt_embeds( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + clip_skip: Optional[int] = None, + clip_model_index: int = 0, + ): + device = device or self._execution_device + + clip_tokenizers = [self.tokenizer, self.tokenizer_2] + clip_text_encoders = [self.text_encoder, self.text_encoder_2] + + tokenizer = clip_tokenizers[clip_model_index] + text_encoder = clip_text_encoders[clip_model_index] + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer_max_length} tokens: {removed_text}" + ) + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + pooled_prompt_embeds = prompt_embeds[0] + + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + _, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1) + pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1) + + return prompt_embeds, pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + prompt_2: Union[str, List[str]], + prompt_3: Union[str, List[str]], + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + negative_prompt_3: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + 
pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + clip_skip: Optional[int] = None, + max_sequence_length: int = 256, + lora_scale: Optional[float] = None, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in all text-encoders + prompt_3 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is + used in all text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders. + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and + `text_encoder_3`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder, lora_scale) + if self.text_encoder_2 is not None and USE_PEFT_BACKEND: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + prompt_3 = prompt_3 or prompt + prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3 + + prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + clip_skip=clip_skip, + clip_model_index=0, + ) + prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds( + prompt=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + clip_skip=clip_skip, + clip_model_index=1, + ) + clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1) + + t5_prompt_embed = self._get_t5_prompt_embeds( + prompt=prompt_3, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + clip_prompt_embeds = torch.nn.functional.pad( + clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1]) + ) + + prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2) + pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + negative_prompt_3 = negative_prompt_3 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + negative_prompt_3 = ( + batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3 + ) + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + + negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds( + negative_prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + clip_skip=None, + clip_model_index=0, + ) + negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds( + negative_prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + clip_skip=None, + clip_model_index=1, + ) + negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1) + + t5_negative_prompt_embed = self._get_t5_prompt_embeds( + prompt=negative_prompt_3, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + negative_clip_prompt_embeds = torch.nn.functional.pad( + negative_clip_prompt_embeds, + (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]), + ) + + negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2) + negative_pooled_prompt_embeds = torch.cat( + [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1 + ) + + if self.text_encoder is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.check_inputs + def check_inputs( + self, + prompt, + prompt_2, + prompt_3, + strength, + negative_prompt=None, + negative_prompt_2=None, + negative_prompt_3=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_3 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_3`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)): + raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_3 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 512: + raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}") + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(num_inference_steps * strength, num_inference_steps) + + t_start = int(max(num_inference_steps - init_timestep, 0)) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + if image.shape[1] == self.vae.config.latent_channels: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = (init_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.scale_noise(init_latents, timestep, noise) + latents = init_latents.to(device=device, dtype=dtype) + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
+    @property +    def do_classifier_free_guidance(self): +        return self._guidance_scale > 1 + +    @property +    def joint_attention_kwargs(self): +        return self._joint_attention_kwargs + +    @property +    def num_timesteps(self): +        return self._num_timesteps + +    @property +    def interrupt(self): +        return self._interrupt + +    @torch.no_grad() +    @replace_example_docstring(EXAMPLE_DOC_STRING) +    def __call__( +        self, +        prompt: Union[str, List[str]] = None, +        prompt_2: Optional[Union[str, List[str]]] = None, +        prompt_3: Optional[Union[str, List[str]]] = None, +        image: PipelineImageInput = None, +        strength: float = 0.6, +        num_inference_steps: int = 50, +        timesteps: List[int] = None, +        guidance_scale: float = 7.0, +        negative_prompt: Optional[Union[str, List[str]]] = None, +        negative_prompt_2: Optional[Union[str, List[str]]] = None, +        negative_prompt_3: Optional[Union[str, List[str]]] = None, +        num_images_per_prompt: Optional[int] = 1, +        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, +        latents: Optional[torch.FloatTensor] = None, +        prompt_embeds: Optional[torch.FloatTensor] = None, +        negative_prompt_embeds: Optional[torch.FloatTensor] = None, +        pooled_prompt_embeds: Optional[torch.FloatTensor] = None, +        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, +        output_type: Optional[str] = "pil", +        return_dict: bool = True, +        joint_attention_kwargs: Optional[Dict[str, Any]] = None, +        clip_skip: Optional[int] = None, +        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, +        callback_on_step_end_tensor_inputs: List[str] = ["latents"], +        max_sequence_length: int = 256, +        pag_scale: float = 3.0, +        pag_adaptive_scale: float = 0.0, +    ): +        r""" +        Function invoked when calling the pipeline for generation. + +        Args: +            prompt (`str` or `List[str]`, *optional*): +                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` +                instead. +            prompt_2 (`str` or `List[str]`, *optional*): +                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` +                will be used instead +            prompt_3 (`str` or `List[str]`, *optional*): +                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` +                will be used instead +            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): +                `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both +                numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list +                of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a +                list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image +                latents as `image`, but if passing latents directly it is not encoded again. +            strength (`float`, *optional*, defaults to 0.6): +                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a +                starting point and more noise is added the higher the `strength`. The number of denoising steps depends +                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising +                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 +                essentially ignores `image`. +            num_inference_steps (`int`, *optional*, defaults to 50): +                The number of denoising steps.
More denoising steps usually lead to a higher quality image at the +                expense of slower inference. +            timesteps (`List[int]`, *optional*): +                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument +                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is +                passed will be used. Must be in descending order. +            guidance_scale (`float`, *optional*, defaults to 7.0): +                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). +                `guidance_scale` is defined as `w` of equation 2. of [Imagen +                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > +                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, +                usually at the expense of lower image quality. +            negative_prompt (`str` or `List[str]`, *optional*): +                The prompt or prompts not to guide the image generation. If not defined, one has to pass +                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is +                less than `1`). +            negative_prompt_2 (`str` or `List[str]`, *optional*): +                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and +                `text_encoder_2`. If not defined, `negative_prompt` is used instead +            negative_prompt_3 (`str` or `List[str]`, *optional*): +                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and +                `text_encoder_3`. If not defined, `negative_prompt` is used instead +            num_images_per_prompt (`int`, *optional*, defaults to 1): +                The number of images to generate per prompt. +            generator (`torch.Generator` or `List[torch.Generator]`, *optional*): +                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) +                to make generation deterministic. +            latents (`torch.FloatTensor`, *optional*): +                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image +                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents +                tensor will be generated by sampling using the supplied random `generator`. +            prompt_embeds (`torch.FloatTensor`, *optional*): +                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not +                provided, text embeddings will be generated from `prompt` input argument. +            negative_prompt_embeds (`torch.FloatTensor`, *optional*): +                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt +                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input +                argument. +            pooled_prompt_embeds (`torch.FloatTensor`, *optional*): +                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. +                If not provided, pooled text embeddings will be generated from `prompt` input argument. +            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): +                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt +                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` +                input argument. +            output_type (`str`, *optional*, defaults to `"pil"`): +                The output format of the generated image. Choose between +                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + joint_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`. + pag_scale (`float`, *optional*, defaults to 3.0): + The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention + guidance will not be used. + pag_adaptive_scale (`float`, *optional*, defaults to 0.0): + The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is + used. + + Examples: + + Returns: + [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + prompt_3, + strength, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + negative_prompt_3=negative_prompt_3, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + max_sequence_length=max_sequence_length, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._joint_attention_kwargs = joint_attention_kwargs + self._interrupt = False + self._pag_scale = pag_scale + self._pag_adaptive_scale = pag_adaptive_scale + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + lora_scale = ( + self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + prompt_3=prompt_3, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + negative_prompt_3=negative_prompt_3, + do_classifier_free_guidance=self.do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + device=device, + clip_skip=self.clip_skip, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + lora_scale=lora_scale, + ) + + if self.do_perturbed_attention_guidance: + prompt_embeds = self._prepare_perturbed_attention_guidance( + prompt_embeds, negative_prompt_embeds, self.do_classifier_free_guidance + ) + pooled_prompt_embeds = self._prepare_perturbed_attention_guidance( + pooled_prompt_embeds, negative_pooled_prompt_embeds, self.do_classifier_free_guidance + ) + elif self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0) + + # 3. Preprocess image + image = self.image_processor.preprocess(image) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # 5. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels + if latents is None: + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + if self.do_perturbed_attention_guidance: + original_attn_proc = self.transformer.attn_processors + self._set_pag_attn_processor( + pag_applied_layers=self.pag_applied_layers, + do_classifier_free_guidance=self.do_classifier_free_guidance, + ) + + # 6. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance, perturbed-attention guidance, or both + latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0])) + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + pooled_projections=pooled_prompt_embeds, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_perturbed_attention_guidance: + noise_pred = self._apply_perturbed_attention_guidance( + noise_pred, self.do_classifier_free_guidance, self.guidance_scale, t + ) + + elif self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + if output_type == "latent": + image = latents + + else: + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + + image = self.vae.decode(latents, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if self.do_perturbed_attention_guidance: + self.transformer.set_attn_processor(original_attn_proc) + + if not return_dict: + return (image,) + + return StableDiffusion3PipelineOutput(images=image) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index b76ea3824060..4fc7cd6aefff 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -1397,6 +1397,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusion3PAGImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, 
["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusion3PAGPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/pag/test_pag_sd3_img2img.py b/tests/pipelines/pag/test_pag_sd3_img2img.py new file mode 100644 index 000000000000..bffcd254e2c5 --- /dev/null +++ b/tests/pipelines/pag/test_pag_sd3_img2img.py @@ -0,0 +1,276 @@ +import gc +import inspect +import random +import unittest + +import numpy as np +import torch +from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel + +from diffusers import ( + AutoencoderKL, + AutoPipelineForImage2Image, + FlowMatchEulerDiscreteScheduler, + SD3Transformer2DModel, + StableDiffusion3Img2ImgPipeline, + StableDiffusion3PAGImg2ImgPipeline, +) +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + load_image, + require_torch_gpu, + slow, + torch_device, +) + +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, + TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS, +) +from ..test_pipelines_common import ( + PipelineTesterMixin, +) + + +enable_full_determinism() + + +class StableDiffusion3PAGImg2ImgPipelineFastTests(unittest.TestCase, PipelineTesterMixin): + pipeline_class = StableDiffusion3PAGImg2ImgPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"pag_scale", "pag_adaptive_scale"}) - {"height", "width"} + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latens_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS + + test_xformers_attention = False + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = SD3Transformer2DModel( + sample_size=32, + patch_size=1, + in_channels=4, + num_layers=2, + attention_head_dim=8, + num_attention_heads=4, + caption_projection_dim=32, + joint_attention_dim=32, + pooled_projection_dim=64, + out_channels=4, + ) + clip_text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ) + + torch.manual_seed(0) + text_encoder = CLIPTextModelWithProjection(clip_text_encoder_config) + + torch.manual_seed(0) + text_encoder_2 = CLIPTextModelWithProjection(clip_text_encoder_config) + + text_encoder_3 = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + tokenizer_3 = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + torch.manual_seed(0) + vae = AutoencoderKL( + sample_size=32, + in_channels=3, + out_channels=3, + block_out_channels=(4,), + layers_per_block=1, + latent_channels=4, + norm_num_groups=1, + use_quant_conv=False, + use_post_quant_conv=False, + shift_factor=0.0609, + scaling_factor=1.5035, + ) + + scheduler = FlowMatchEulerDiscreteScheduler() + + return { + 
"scheduler": scheduler, + "text_encoder": text_encoder, + "text_encoder_2": text_encoder_2, + "text_encoder_3": text_encoder_3, + "tokenizer": tokenizer, + "tokenizer_2": tokenizer_2, + "tokenizer_3": tokenizer_3, + "transformer": transformer, + "vae": vae, + } + + def get_dummy_inputs(self, device, seed=0): + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image / 2 + 0.5 + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device="cpu").manual_seed(seed) + + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "output_type": "np", + "pag_scale": 0.7, + } + return inputs + + def test_pag_disable_enable(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + + # base pipeline (expect same output when pag is disabled) + pipe_sd = StableDiffusion3Img2ImgPipeline(**components) + pipe_sd = pipe_sd.to(device) + pipe_sd.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + del inputs["pag_scale"] + assert ( + "pag_scale" not in inspect.signature(pipe_sd.__call__).parameters + ), f"`pag_scale` should not be a call parameter of the base pipeline {pipe_sd.__class__.__name__}." + out = pipe_sd(**inputs).images[0, -3:, -3:, -1] + + components = self.get_dummy_components() + + # pag disabled with pag_scale=0.0 + pipe_pag = self.pipeline_class(**components) + pipe_pag = pipe_pag.to(device) + pipe_pag.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["pag_scale"] = 0.0 + out_pag_disabled = pipe_pag(**inputs).images[0, -3:, -3:, -1] + + assert np.abs(out.flatten() - out_pag_disabled.flatten()).max() < 1e-3 + + def test_pag_inference(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + + pipe_pag = self.pipeline_class(**components, pag_applied_layers=["blocks.0"]) + pipe_pag = pipe_pag.to(device) + pipe_pag.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe_pag(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == ( + 1, + 32, + 32, + 3, + ), f"the shape of the output image should be (1, 32, 32, 3) but got {image.shape}" + + expected_slice = np.array( + [0.66063476, 0.44838923, 0.5484299, 0.7242875, 0.5970012, 0.6015729, 0.53080845, 0.52220416, 0.56397927] + ) + max_diff = np.abs(image_slice.flatten() - expected_slice).max() + self.assertLessEqual(max_diff, 1e-3) + + +@slow +@require_torch_gpu +class StableDiffusion3PAGImg2ImgPipelineIntegrationTests(unittest.TestCase): + pipeline_class = StableDiffusion3PAGImg2ImgPipeline + repo_id = "stabilityai/stable-diffusion-3-medium-diffusers" + + def setUp(self): + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs( + self, device, generator_device="cpu", dtype=torch.float32, seed=0, guidance_scale=7.0, pag_scale=0.7 + ): + img_url = ( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png" + ) + init_image = load_image(img_url) + + generator = torch.Generator(device=generator_device).manual_seed(seed) + inputs = { + "prompt": "an astronaut in a space suit walking through a jungle", + "generator": 
generator, + "image": init_image, + "num_inference_steps": 12, + "strength": 0.6, + "guidance_scale": guidance_scale, + "pag_scale": pag_scale, + "output_type": "np", + } + return inputs + + def test_pag_cfg(self): + pipeline = AutoPipelineForImage2Image.from_pretrained( + self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.17"] + ) + pipeline.enable_model_cpu_offload() + pipeline.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = pipeline(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + assert image.shape == (1, 1024, 1024, 3) + expected_slice = np.array( + [ + 0.16772461, + 0.17626953, + 0.18432617, + 0.17822266, + 0.18359375, + 0.17626953, + 0.17407227, + 0.17700195, + 0.17822266, + ] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + ), f"output is different from expected, {image_slice.flatten()}" + + def test_pag_uncond(self): + pipeline = AutoPipelineForImage2Image.from_pretrained( + self.repo_id, enable_pag=True, torch_dtype=torch.float16, pag_applied_layers=["blocks.(4|17)"] + ) + pipeline.enable_model_cpu_offload() + pipeline.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device, guidance_scale=0.0, pag_scale=1.8) + image = pipeline(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + assert image.shape == (1, 1024, 1024, 3) + expected_slice = np.array( + [0.1508789, 0.16210938, 0.17138672, 0.16210938, 0.17089844, 0.16137695, 0.16235352, 0.16430664, 0.16455078] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + ), f"output is different from expected, {image_slice.flatten()}" From cf258948b2a9b1645cc2f61dc017c28cec29b101 Mon Sep 17 00:00:00 2001 From: Parag Ekbote Date: Tue, 3 Dec 2024 23:53:00 +0530 Subject: [PATCH 20/38] Notebooks for Community Scripts-4 (#10094) * Add Diffuser Notebooks for Community Scripts. * Add missing link. * Styling Improvement. 
--- examples/community/README.md | 22 ++++++++++++------- .../community/README_community_scripts.md | 6 ++--- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index 653355fe19a4..611a278af88e 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -11,7 +11,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif | Example | Description | Code Example | Colab | Author | |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:| |Adaptive Mask Inpainting|Adaptive Mask Inpainting algorithm from [Beyond the Contact: Discovering Comprehensive Affordance for 3D Objects from Pre-trained 2D Diffusion Models](https://github.com/snuvclab/coma) (ECCV '24, Oral) provides a way to insert human inside the scene image without altering the background, by inpainting with adapting mask.|[Adaptive Mask Inpainting](#adaptive-mask-inpainting)|-|[Hyeonwoo Kim](https://sshowbiz.xyz),[Sookwan Han](https://jellyheadandrew.github.io)| -|Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|NA|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)| +|Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/flux_with_cfg.ipynb)|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)| |Differential Diffusion|[Differential Diffusion](https://github.com/exx8/differential-diffusion) modifies an image according to a text prompt, and according to a map that specifies the amount of change in each region.|[Differential Diffusion](#differential-diffusion)|[![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/exx8/differential-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/exx8/differential-diffusion/blob/main/examples/SD2.ipynb)|[Eran Levin](https://github.com/exx8) and [Ohad Fried](https://www.ohadf.com/)| | HD-Painter | 
[HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method. | [HD-Painter](#hd-painter) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/PAIR/HD-Painter) | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) | | Marigold Monocular Depth Estimation | A universal monocular depth estimator, utilizing Stable Diffusion, delivering sharp predictions in the wild. (See the [project page](https://marigoldmonodepth.github.io) and [full codebase](https://github.com/prs-eth/marigold) for more details.) | [Marigold Depth Estimation](#marigold-depth-estimation) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/toshas/marigold) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12G8reD13DdpMie5ZQlaFNo2WCGeNUH-u?usp=sharing) | [Bingxin Ke](https://github.com/markkua) and [Anton Obukhov](https://github.com/toshas) | @@ -26,7 +26,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif | [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | | Seed Resizing Stable Diffusion | Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) | | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image | [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | -| Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | - | [Juan Carlos Piñeros](https://github.com/juancopi81) | +| Multilingual Stable Diffusion | Stable Diffusion Pipeline that supports prompts in 50 different languages. | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/multilingual_stable_diffusion.ipynb) | [Juan Carlos Piñeros](https://github.com/juancopi81) | | GlueGen Stable Diffusion | Stable Diffusion Pipeline that supports prompts in different languages using GlueGen adapter. 
| [GlueGen Stable Diffusion](#gluegen-stable-diffusion-pipeline) | - | [Phạm Hồng Vinh](https://github.com/rootonchair) | | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting | [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion) | - | [Alex McKinney](https://github.com/vvvm23) | | Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting | [Text Based Inpainting Stable Diffusion](#text-based-inpainting-stable-diffusion) | - | [Dhruv Karan](https://github.com/unography) | @@ -41,8 +41,8 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif | DDIM Noise Comparative Analysis Pipeline | Investigating how the diffusion models learn visual concepts from each noise level (which is a contribution of [P2 weighting (CVPR 2022)](https://arxiv.org/abs/2204.00227)) | [DDIM Noise Comparative Analysis Pipeline](#ddim-noise-comparative-analysis-pipeline) | - | [Aengus (Duc-Anh)](https://github.com/aengusng8) | | CLIP Guided Img2Img Stable Diffusion Pipeline | Doing CLIP guidance for image to image generation with Stable Diffusion | [CLIP Guided Img2Img Stable Diffusion](#clip-guided-img2img-stable-diffusion) | - | [Nipun Jindal](https://github.com/nipunjindal/) | | TensorRT Stable Diffusion Text to Image Pipeline | Accelerates the Stable Diffusion Text2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Text to Image Pipeline](#tensorrt-text2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | -| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) | -| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) | +| EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | [Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/edict_image_pipeline.ipynb) | [Joqsan Azocar](https://github.com/Joqsan) | +| Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.09865) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint )|[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/stable_diffusion_repaint.ipynb)| [Markus Pobitzer](https://github.com/Markus-Pobitzer) | | TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | | Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) | | CLIP Guided Images Mixing Stable Diffusion Pipeline | Сombine images using usual diffusion models. 
| [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) | @@ -251,24 +251,30 @@ Example usage: from diffusers import DiffusionPipeline import torch +model_name = "black-forest-labs/FLUX.1-dev" +prompt = "a watercolor painting of a unicorn" +negative_prompt = "pink" + +# Load the diffusion pipeline pipeline = DiffusionPipeline.from_pretrained( - "black-forest-labs/FLUX.1-dev", + model_name, torch_dtype=torch.bfloat16, custom_pipeline="pipeline_flux_with_cfg" ) pipeline.enable_model_cpu_offload() -prompt = "a watercolor painting of a unicorn" -negative_prompt = "pink" +# Generate the image img = pipeline( prompt=prompt, negative_prompt=negative_prompt, true_cfg=1.5, guidance_scale=3.5, - num_images_per_prompt=1, generator=torch.manual_seed(0) ).images[0] + +# Save the generated image img.save("cfg_flux.png") +print("Image generated and saved successfully.") ``` ### Differential Diffusion diff --git a/examples/community/README_community_scripts.md b/examples/community/README_community_scripts.md index b7641f73855b..eae50247c9e5 100644 --- a/examples/community/README_community_scripts.md +++ b/examples/community/README_community_scripts.md @@ -6,9 +6,9 @@ If a community script doesn't work as expected, please open an issue and ping th | Example | Description | Code Example | Colab | Author | |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:| -| Using IP-Adapter with Negative Noise | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) | https://github.com/huggingface/notebooks/blob/main/diffusers/ip_adapter_negative_noise.ipynb | [Álvaro Somoza](https://github.com/asomoza)| -| Asymmetric Tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#Asymmetric-Tiling ) |https://github.com/huggingface/notebooks/blob/main/diffusers/asymetric_tiling.ipynb | [alexisrolland](https://github.com/alexisrolland)| -| Prompt Scheduling Callback |Allows changing prompts during a generation | [Prompt Scheduling-Callback](#Prompt-Scheduling-Callback ) |https://github.com/huggingface/notebooks/blob/main/diffusers/prompt_scheduling_callback.ipynb | [hlky](https://github.com/hlky)| +| Using IP-Adapter with Negative Noise | Using negative noise with IP-adapter to better control the generation (see the [original 
post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details) | [IP-Adapter Negative Noise](#ip-adapter-negative-noise) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/ip_adapter_negative_noise.ipynb) | [Álvaro Somoza](https://github.com/asomoza)| +| Asymmetric Tiling |configure seamless image tiling independently for the X and Y axes | [Asymmetric Tiling](#Asymmetric-Tiling ) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/asymetric_tiling.ipynb) | [alexisrolland](https://github.com/alexisrolland)| +| Prompt Scheduling Callback |Allows changing prompts during a generation | [Prompt Scheduling-Callback](#Prompt-Scheduling-Callback ) |[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/prompt_scheduling_callback.ipynb) | [hlky](https://github.com/hlky)| ## Example usages From 2be66e6aa097ec9006d98e31c41f2e867cf6683a Mon Sep 17 00:00:00 2001 From: Parag Ekbote Date: Tue, 3 Dec 2024 23:53:35 +0530 Subject: [PATCH 21/38] Fix Broken Link in Optimization Docs (#10105) Update broken link. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b99ca828e4d0..afecd64d9521 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l | [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. | | [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. | | [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. | -| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview) | Guides for how to optimize your diffusion model to run faster and consume less memory. | +| [Optimization](https://huggingface.co/docs/diffusers/optimization/fp16) | Guides for how to optimize your diffusion model to run faster and consume less memory. | | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. 
| ## Contribution From 8ac6de963c1f95dbe17173169e6c866f201a78ab Mon Sep 17 00:00:00 2001 From: StAlKeR7779 Date: Wed, 4 Dec 2024 00:21:37 +0300 Subject: [PATCH 22/38] DPM++ third order fixes (#9104) * Fix wrong output on 3n-1 steps count * Add sde handling to 3 order * make * copies --------- Co-authored-by: hlky --- src/diffusers/__init__.py | 2 +- .../scheduling_dpmsolver_multistep.py | 12 +++++++++- .../scheduling_dpmsolver_multistep_inverse.py | 10 ++++++++ .../scheduling_dpmsolver_singlestep.py | 24 ++++++++++++++++++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 6f70a8191629..db46dc1d8801 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -338,8 +338,8 @@ "StableDiffusion3ControlNetPipeline", "StableDiffusion3Img2ImgPipeline", "StableDiffusion3InpaintPipeline", - "StableDiffusion3PAGPipeline", "StableDiffusion3PAGImg2ImgPipeline", + "StableDiffusion3PAGPipeline", "StableDiffusion3Pipeline", "StableDiffusionAdapterPipeline", "StableDiffusionAttendAndExcitePipeline", diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 4b21328dccb5..e7704f2ced19 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -889,6 +889,7 @@ def multistep_dpm_solver_third_order_update( model_output_list: List[torch.Tensor], *args, sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -967,6 +968,15 @@ def multistep_dpm_solver_third_order_update( - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1.0 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h) - 2.0 * h) / (2.0 * h) ** 2 - 0.5)) * D2 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) return x_t def index_for_timestep(self, timestep, schedule_timesteps=None): @@ -1073,7 +1083,7 @@ def step( elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise) else: - prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample) + prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample, noise=noise) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index 9f10d39ed40c..2968d0ef7b8e 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -764,6 +764,7 @@ def multistep_dpm_solver_third_order_update( model_output_list: List[torch.Tensor], *args, sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -842,6 +843,15 @@ def multistep_dpm_solver_third_order_update( - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 ) + elif self.config.algorithm_type == 
"sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1.0 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h) - 2.0 * h) / (2.0 * h) ** 2 - 0.5)) * D2 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) return x_t def _init_step_index(self, timestep): diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 868122971e40..02af15ae5c6a 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -264,6 +264,10 @@ def get_order_list(self, num_inference_steps: int) -> List[int]: orders = [1, 2] * (steps // 2) elif order == 1: orders = [1] * steps + + if self.config.final_sigmas_type == "zero": + orders[-1] = 1 + return orders @property @@ -812,6 +816,7 @@ def singlestep_dpm_solver_third_order_update( model_output_list: List[torch.Tensor], *args, sample: torch.Tensor = None, + noise: Optional[torch.Tensor] = None, **kwargs, ) -> torch.Tensor: """ @@ -909,6 +914,23 @@ def singlestep_dpm_solver_third_order_update( - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s2 * torch.exp(-h)) * sample + + (alpha_t * (1.0 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1_1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s2 * torch.exp(-h)) * sample + + (alpha_t * (1.0 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h) + (-2.0 * h)) / (-2.0 * h) ** 2 - 0.5)) * D2 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) return x_t def singlestep_dpm_solver_update( @@ -970,7 +992,7 @@ def singlestep_dpm_solver_update( elif order == 2: return self.singlestep_dpm_solver_second_order_update(model_output_list, sample=sample, noise=noise) elif order == 3: - return self.singlestep_dpm_solver_third_order_update(model_output_list, sample=sample) + return self.singlestep_dpm_solver_third_order_update(model_output_list, sample=sample, noise=noise) else: raise ValueError(f"Order must be 1, 2, 3, got {order}") From b58f67f2d559b15cb2e0c4d2f1448df4d4183c39 Mon Sep 17 00:00:00 2001 From: aihao <51043929+aihao2000@users.noreply.github.com> Date: Wed, 4 Dec 2024 05:26:47 +0800 Subject: [PATCH 23/38] update (#7067) * add data_dir parameter to load_dataset --------- Co-authored-by: Sayak Paul Co-authored-by: YiYi Xu Co-authored-by: hlky --- examples/controlnet/train_controlnet.py | 4 +--- examples/controlnet/train_controlnet_sdxl.py | 4 +--- examples/text_to_image/train_text_to_image_sdxl.py | 5 +---- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index a2aa266cdfbc..1ddddd18b6e8 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -571,9 +571,6 @@ def parse_args(input_args=None): if args.dataset_name is None and args.train_data_dir is None: raise ValueError("Specify either `--dataset_name` or `--train_data_dir`") - if 
args.dataset_name is not None and args.train_data_dir is not None: - raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`") - if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") @@ -615,6 +612,7 @@ def make_train_dataset(args, tokenizer, accelerator): args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, + data_dir=args.train_data_dir, ) else: if args.train_data_dir is not None: diff --git a/examples/controlnet/train_controlnet_sdxl.py b/examples/controlnet/train_controlnet_sdxl.py index c034c027cbcd..df4ef0f7ddd6 100644 --- a/examples/controlnet/train_controlnet_sdxl.py +++ b/examples/controlnet/train_controlnet_sdxl.py @@ -598,9 +598,6 @@ def parse_args(input_args=None): if args.dataset_name is None and args.train_data_dir is None: raise ValueError("Specify either `--dataset_name` or `--train_data_dir`") - if args.dataset_name is not None and args.train_data_dir is not None: - raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`") - if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") @@ -642,6 +639,7 @@ def get_train_dataset(args, accelerator): args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, + data_dir=args.train_data_dir, ) else: if args.train_data_dir is not None: diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index b34feb6f715c..398e793c045a 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -483,7 +483,6 @@ def parse_args(input_args=None): # Sanity checks if args.dataset_name is None and args.train_data_dir is None: raise ValueError("Need either a dataset name or a training folder.") - if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") @@ -824,9 +823,7 @@ def load_model_hook(models, input_dir): if args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( - args.dataset_name, - args.dataset_config_name, - cache_dir=args.cache_dir, + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir ) else: data_files = {} From 619b9658e286ed10560a13f80084e286a6d85956 Mon Sep 17 00:00:00 2001 From: lsb Date: Tue, 3 Dec 2024 13:54:32 -0800 Subject: [PATCH 24/38] Avoid compiling a progress bar. (#10098) * Avoid creating a progress bar when it is disabled. This is useful when exporting a pipeline, and allows a compiler to avoid trying to compile away tqdm. * Prevent the PyTorch compiler from compiling progress bars. 
* Update pipeline_utils.py --- src/diffusers/pipelines/pipeline_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index a4faacb44914..5a4219adcb37 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1552,6 +1552,7 @@ def numpy_to_pil(images): """ return numpy_to_pil(images) + @torch.compiler.disable def progress_bar(self, iterable=None, total=None): if not hasattr(self, "_progress_bar_config"): self._progress_bar_config = {} From 5effcd3e6461490ab27171d7c576d0ea4909a4a8 Mon Sep 17 00:00:00 2001 From: Anand Kumar <63339285+AnandK27@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:57:52 -0800 Subject: [PATCH 25/38] [Bug fix] "previous_timestep()" in DDPM scheduling compatible with "trailing" and "linspace" options (#9384) * Update scheduling_ddpm.py * fix copies --------- Co-authored-by: YiYi Xu Co-authored-by: hlky --- src/diffusers/schedulers/scheduling_ddpm.py | 8 ++------ src/diffusers/schedulers/scheduling_ddpm_parallel.py | 8 ++------ src/diffusers/schedulers/scheduling_lcm.py | 8 ++------ src/diffusers/schedulers/scheduling_tcd.py | 8 ++------ 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 468fdf61a9ef..eb40d79b9f60 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -548,16 +548,12 @@ def __len__(self): return self.config.num_train_timesteps def previous_timestep(self, timestep): - if self.custom_timesteps: + if self.custom_timesteps or self.num_inference_steps: index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] if index == self.timesteps.shape[0] - 1: prev_t = torch.tensor(-1) else: prev_t = self.timesteps[index + 1] else: - num_inference_steps = ( - self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - ) - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps - + prev_t = timestep - 1 return prev_t diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index f377ee6e8c93..20ad7a4c927d 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -639,16 +639,12 @@ def __len__(self): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep def previous_timestep(self, timestep): - if self.custom_timesteps: + if self.custom_timesteps or self.num_inference_steps: index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] if index == self.timesteps.shape[0] - 1: prev_t = torch.tensor(-1) else: prev_t = self.timesteps[index + 1] else: - num_inference_steps = ( - self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - ) - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps - + prev_t = timestep - 1 return prev_t diff --git a/src/diffusers/schedulers/scheduling_lcm.py b/src/diffusers/schedulers/scheduling_lcm.py index f1aa09ab1723..686b686f6870 100644 --- a/src/diffusers/schedulers/scheduling_lcm.py +++ b/src/diffusers/schedulers/scheduling_lcm.py @@ -643,16 +643,12 @@ def __len__(self): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep def previous_timestep(self, timestep): - if self.custom_timesteps: + if self.custom_timesteps or 
self.num_inference_steps: index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] if index == self.timesteps.shape[0] - 1: prev_t = torch.tensor(-1) else: prev_t = self.timesteps[index + 1] else: - num_inference_steps = ( - self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - ) - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps - + prev_t = timestep - 1 return prev_t diff --git a/src/diffusers/schedulers/scheduling_tcd.py b/src/diffusers/schedulers/scheduling_tcd.py index 580224404c54..5d60383142a4 100644 --- a/src/diffusers/schedulers/scheduling_tcd.py +++ b/src/diffusers/schedulers/scheduling_tcd.py @@ -680,16 +680,12 @@ def __len__(self): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep def previous_timestep(self, timestep): - if self.custom_timesteps: + if self.custom_timesteps or self.num_inference_steps: index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] if index == self.timesteps.shape[0] - 1: prev_t = torch.tensor(-1) else: prev_t = self.timesteps[index + 1] else: - num_inference_steps = ( - self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps - ) - prev_t = timestep - self.config.num_train_timesteps // num_inference_steps - + prev_t = timestep - 1 return prev_t From 6a51427b6a226591ccc40249721c486855f53e1c Mon Sep 17 00:00:00 2001 From: hlky Date: Tue, 3 Dec 2024 23:58:31 +0000 Subject: [PATCH 26/38] Fix multi-prompt inference (#10103) * Fix multi-prompt inference Fix generation of multiple images with multiple prompts, e.g len(prompts)>1, num_images_per_prompt>1 * make * fix copies --------- Co-authored-by: Nikita Balabin --- .../pipelines/allegro/pipeline_allegro.py | 19 ++++++------------- .../pag/pipeline_pag_pixart_sigma.py | 19 ++++++------------- .../pixart_alpha/pipeline_pixart_alpha.py | 19 ++++++------------- .../pixart_alpha/pipeline_pixart_sigma.py | 19 ++++++------------- 4 files changed, 24 insertions(+), 52 deletions(-) diff --git a/src/diffusers/pipelines/allegro/pipeline_allegro.py b/src/diffusers/pipelines/allegro/pipeline_allegro.py index 9314960f9618..9d6c650fc88d 100644 --- a/src/diffusers/pipelines/allegro/pipeline_allegro.py +++ b/src/diffusers/pipelines/allegro/pipeline_allegro.py @@ -251,13 +251,6 @@ def encode_prompt( if device is None: device = self._execution_device - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # See Section 3.1. of the paper. 
max_length = max_sequence_length @@ -302,12 +295,12 @@ def encode_prompt( # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1) - prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) - prompt_attention_mask = prompt_attention_mask.repeat(num_videos_per_prompt, 1) + prompt_attention_mask = prompt_attention_mask.repeat(1, num_videos_per_prompt) + prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_videos_per_prompt, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt + uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -334,10 +327,10 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_videos_per_prompt, 1) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_videos_per_prompt) + negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_videos_per_prompt, -1) else: negative_prompt_embeds = None negative_prompt_attention_mask = None diff --git a/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py b/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py index 59d6a9001e1f..b2fbdd683e86 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_pixart_sigma.py @@ -227,13 +227,6 @@ def encode_prompt( if device is None: device = self._execution_device - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # See Section 3.1. of the paper. 
max_length = max_sequence_length @@ -278,12 +271,12 @@ def encode_prompt( # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) - prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) + prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt) + prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt + uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -310,10 +303,10 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt) + negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) else: negative_prompt_embeds = None negative_prompt_attention_mask = None diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index 46d8ad5e6dfa..391b831166d2 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -338,13 +338,6 @@ def encode_prompt( if device is None: device = self._execution_device - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # See Section 3.1. of the paper. 
max_length = max_sequence_length @@ -389,12 +382,12 @@ def encode_prompt( # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) - prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) + prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt) + prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt + uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -421,10 +414,10 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt) + negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) else: negative_prompt_embeds = None negative_prompt_attention_mask = None diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index b2772d552514..64e1e5bae06c 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -264,13 +264,6 @@ def encode_prompt( if device is None: device = self._execution_device - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - # See Section 3.1. of the paper. 
max_length = max_sequence_length @@ -315,12 +308,12 @@ def encode_prompt( # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) - prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) + prompt_attention_mask = prompt_attention_mask.repeat(1, num_images_per_prompt) + prompt_attention_mask = prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt + uncond_tokens = [negative_prompt] * bs_embed if isinstance(negative_prompt, str) else negative_prompt uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -347,10 +340,10 @@ def encode_prompt( negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1) - negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(1, num_images_per_prompt) + negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed * num_images_per_prompt, -1) else: negative_prompt_embeds = None negative_prompt_attention_mask = None From cfdeebd4a8f0decc3d0e1f0f05a7112ddd1e0a29 Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 4 Dec 2024 00:28:31 +0000 Subject: [PATCH 27/38] Test `skip_guidance_layers` in SD3 pipeline (#10102) * Test `skip_guidance_layers` in pipelines * Move to test_pipeline_stable_diffusion_3 --- .../test_pipeline_stable_diffusion_3.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py index 7767c94c4879..07ce5487f256 100644 --- a/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py +++ b/tests/pipelines/stable_diffusion_3/test_pipeline_stable_diffusion_3.py @@ -225,6 +225,39 @@ def test_fused_qkv_projections(self): original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2 ), "Original outputs should match when fused QKV projections are disabled." 
+ def test_skip_guidance_layers(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + + output_full = pipe(**inputs)[0] + + inputs_with_skip = inputs.copy() + inputs_with_skip["skip_guidance_layers"] = [0] + output_skip = pipe(**inputs_with_skip)[0] + + self.assertFalse( + np.allclose(output_full, output_skip, atol=1e-5), "Outputs should differ when layers are skipped" + ) + + self.assertEqual(output_full.shape, output_skip.shape, "Outputs should have the same shape") + + inputs["num_images_per_prompt"] = 2 + output_full = pipe(**inputs)[0] + + inputs_with_skip = inputs.copy() + inputs_with_skip["skip_guidance_layers"] = [0] + output_skip = pipe(**inputs_with_skip)[0] + + self.assertFalse( + np.allclose(output_full, output_skip, atol=1e-5), "Outputs should differ when layers are skipped" + ) + + self.assertEqual(output_full.shape, output_skip.shape, "Outputs should have the same shape") + @slow @require_big_gpu_with_torch_cuda From 8421c1461bf4ab7801070d04d6ec1e6b28ee5b59 Mon Sep 17 00:00:00 2001 From: Ivan Skorokhodov Date: Tue, 3 Dec 2024 23:20:11 -0800 Subject: [PATCH 28/38] Use parameters + buffers when deciding upscale_dtype (#9882) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes, the decoder might lack parameters and only buffers (e.g., this happens when we manually need to convert all the parameters to buffers — e.g. to avoid packing fp16 and fp32 parameters with FSDP) --- .../models/autoencoders/autoencoder_kl_temporal_decoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index 4e3902ae6dbe..f25430050ce5 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import itertools from typing import Dict, Optional, Tuple, Union import torch @@ -94,7 +95,7 @@ def forward( sample = self.conv_in(sample) - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + upscale_dtype = next(itertools.chain(self.up_blocks.parameters(), self.up_blocks.buffers())).dtype if torch.is_grad_enabled() and self.gradient_checkpointing: def create_custom_forward(module): From c1926cef6b2c880766db3581ed6035c99005f00e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 4 Dec 2024 15:58:36 +0530 Subject: [PATCH 29/38] [tests] refactor vae tests (#9808) * add: autoencoderkl tests * autoencodertiny. * fix * asymmetric autoencoder. * more * integration tests for stable audio decoder. * consistency decoder vae tests * remove grad check from consistency decoder. 
* cog * bye test_models_vae.py * fix * fix * remove allegro * fixes * fixes * fixes --------- Co-authored-by: Dhruv Nair --- .../autoencoders/autoencoder_kl_cogvideox.py | 20 +- .../autoencoder_kl_temporal_decoder.py | 8 - .../models/autoencoders/autoencoder_tiny.py | 6 +- .../test_models_asymmetric_autoencoder_kl.py | 261 ++++ .../test_models_autoencoder_kl.py | 468 ++++++ .../test_models_autoencoder_kl_cogvideox.py | 179 +++ ..._models_autoencoder_kl_temporal_decoder.py | 73 + .../test_models_autoencoder_oobleck.py | 228 +++ .../test_models_autoencoder_tiny.py | 251 ++++ .../test_models_consistency_decoder_vae.py | 300 ++++ tests/models/autoencoders/test_models_vae.py | 1249 ----------------- tests/models/autoencoders/vae.py | 86 ++ tests/models/test_modeling_common.py | 5 - .../controlnet_xs/test_controlnetxs.py | 2 +- .../controlnet_xs/test_controlnetxs_sdxl.py | 2 +- tests/pipelines/test_pipelines_common.py | 2 +- 16 files changed, 1863 insertions(+), 1277 deletions(-) create mode 100644 tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl.py create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py create mode 100644 tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py create mode 100644 tests/models/autoencoders/test_models_autoencoder_oobleck.py create mode 100644 tests/models/autoencoders/test_models_autoencoder_tiny.py create mode 100644 tests/models/autoencoders/test_models_consistency_decoder_vae.py delete mode 100644 tests/models/autoencoders/test_models_vae.py create mode 100644 tests/models/autoencoders/vae.py diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py index fbcb964392f9..941b3eb07f10 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py @@ -433,7 +433,7 @@ def create_forward(*inputs): hidden_states, temb, zq, - conv_cache=conv_cache.get(conv_cache_key), + conv_cache.get(conv_cache_key), ) else: hidden_states, new_conv_cache[conv_cache_key] = resnet( @@ -531,7 +531,7 @@ def create_forward(*inputs): return create_forward hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key) ) else: hidden_states, new_conv_cache[conv_cache_key] = resnet( @@ -649,7 +649,7 @@ def create_forward(*inputs): hidden_states, temb, zq, - conv_cache=conv_cache.get(conv_cache_key), + conv_cache.get(conv_cache_key), ) else: hidden_states, new_conv_cache[conv_cache_key] = resnet( @@ -789,7 +789,7 @@ def custom_forward(*inputs): hidden_states, temb, None, - conv_cache=conv_cache.get(conv_cache_key), + conv_cache.get(conv_cache_key), ) # 2. Mid @@ -798,14 +798,14 @@ def custom_forward(*inputs): hidden_states, temb, None, - conv_cache=conv_cache.get("mid_block"), + conv_cache.get("mid_block"), ) else: # 1. Down for i, down_block in enumerate(self.down_blocks): conv_cache_key = f"down_block_{i}" hidden_states, new_conv_cache[conv_cache_key] = down_block( - hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key) + hidden_states, temb, None, conv_cache.get(conv_cache_key) ) # 2. 
Mid @@ -953,7 +953,7 @@ def custom_forward(*inputs): hidden_states, temb, sample, - conv_cache=conv_cache.get("mid_block"), + conv_cache.get("mid_block"), ) # 2. Up @@ -964,7 +964,7 @@ def custom_forward(*inputs): hidden_states, temb, sample, - conv_cache=conv_cache.get(conv_cache_key), + conv_cache.get(conv_cache_key), ) else: # 1. Mid @@ -1476,7 +1476,7 @@ def forward( z = posterior.sample(generator=generator) else: z = posterior.mode() - dec = self.decode(z) + dec = self.decode(z).sample if not return_dict: return (dec,) - return dec + return DecoderOutput(sample=dec) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py index f25430050ce5..38ad78c0707b 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -229,14 +229,6 @@ def __init__( self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) - sample_size = ( - self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size - ) - self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) - self.tile_overlap_factor = 0.25 - def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (Encoder, TemporalDecoder)): module.gradient_checkpointing = value diff --git a/src/diffusers/models/autoencoders/autoencoder_tiny.py b/src/diffusers/models/autoencoders/autoencoder_tiny.py index 6e503478fe2b..35081c22dfc4 100644 --- a/src/diffusers/models/autoencoders/autoencoder_tiny.py +++ b/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -310,7 +310,9 @@ def decode( self, x: torch.Tensor, generator: Optional[torch.Generator] = None, return_dict: bool = True ) -> Union[DecoderOutput, Tuple[torch.Tensor]]: if self.use_slicing and x.shape[0] > 1: - output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)] + output = [ + self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x_slice) for x_slice in x.split(1) + ] output = torch.cat(output) else: output = self._tiled_decode(x) if self.use_tiling else self.decoder(x) @@ -341,7 +343,7 @@ def forward( # as if we were loading the latents from an RGBA uint8 image. unscaled_enc = self.unscale_latents(scaled_enc / 255.0) - dec = self.decode(unscaled_enc) + dec = self.decode(unscaled_enc).sample if not return_dict: return (dec,) diff --git a/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py b/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py new file mode 100644 index 000000000000..11b93ac2fb45 --- /dev/null +++ b/tests/models/autoencoders/test_models_asymmetric_autoencoder_kl.py @@ -0,0 +1,261 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AsymmetricAutoencoderKL +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + require_torch_accelerator, + require_torch_gpu, + skip_mps, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AsymmetricAutoencoderKL + main_input_name = "sample" + base_precision = 1e-2 + + def get_asym_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "down_block_out_channels": block_out_channels, + "layers_per_down_block": 1, + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "up_block_out_channels": block_out_channels, + "layers_per_up_block": 1, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + "sample_size": 32, + "scaling_factor": 0.18215, + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + mask = torch.ones((batch_size, 1) + sizes).to(torch_device) + + return {"sample": image, "mask": mask} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_asym_autoencoder_kl_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + @unittest.skip("Unsupported test.") + def test_forward_with_norm_groups(self): + pass + + +@slow +class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase): + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x-1-5", fp16=False): + revision = "main" + torch_dtype = torch.float32 + + model = AsymmetricAutoencoderKL.from_pretrained( + model_id, + torch_dtype=torch_dtype, + revision=revision, + ) + model.to(torch_device).eval() + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205], + [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], + ], + [ + 47, + [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529], + [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, 
-0.0493, -0.4089], + ], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.0340, 0.2870, 0.1698, -0.0105, -0.3448, 0.3529, -0.1321, 0.1097], + [-0.0344, 0.2912, 0.1687, -0.0137, -0.3462, 0.3552, -0.1337, 0.1078], + ], + [ + 47, + [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], + [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], + ], + # fmt: on + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [13, [-0.0521, -0.2939, 0.1540, -0.1855, -0.5936, -0.3138, -0.4579, -0.2275]], + [37, [-0.1820, -0.4345, -0.0455, -0.2923, -0.8035, -0.5089, -0.4795, -0.3106]], + # fmt: on + ] + ) + @require_torch_accelerator + @skip_mps + def test_stable_diffusion_decode(self, seed, expected_slice): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=2e-3) + + @parameterized.expand([(13,), (16,), (37,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=5e-2) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], + [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], + # fmt: on + ] + ) + def test_stable_diffusion_encode_sample(self, seed, expected_slice): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + dist = model.encode(image).latent_dist + sample = dist.sample(generator=generator) + + assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] + + output_slice = sample[0, -1, -3:, -3:].flatten().cpu() + 
expected_output_slice = torch.tensor(expected_slice) + + tolerance = 3e-3 if torch_device != "mps" else 1e-2 + assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/tests/models/autoencoders/test_models_autoencoder_kl.py b/tests/models/autoencoders/test_models_autoencoder_kl.py new file mode 100644 index 000000000000..52bf5aba204b --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl.py @@ -0,0 +1,468 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AutoencoderKL +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + require_torch_accelerator, + require_torch_accelerator_with_fp16, + require_torch_gpu, + skip_mps, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKL + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_kl_config(self, block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "block_out_channels": block_out_channels, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_kl_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = 
model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"Decoder", "Encoder"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_from_pretrained_hub(self): + model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = model(**self.dummy_input) + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained(self): + model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") + model = model.to(torch_device) + model.eval() + + # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + generator = torch.Generator(device=generator_device).manual_seed(0) + else: + generator = torch.manual_seed(0) + + image = torch.randn( + 1, + model.config.in_channels, + model.config.sample_size, + model.config.sample_size, + generator=torch.manual_seed(0), + ) + image = image.to(torch_device) + with torch.no_grad(): + output = model(image, sample_posterior=True, generator=generator).sample + + output_slice = output[0, -1, -3:, -3:].flatten().cpu() + + # Since the VAE Gaussian prior's generator is seeded on the appropriate device, + # the expected output slices are not the same for CPU and GPU. 
+ if torch_device == "mps": + expected_output_slice = torch.tensor( + [ + -4.0078e-01, + -3.8323e-04, + -1.2681e-01, + -1.1462e-01, + 2.0095e-01, + 1.0893e-01, + -8.8247e-02, + -3.0361e-01, + -9.8644e-03, + ] + ) + elif generator_device == "cpu": + expected_output_slice = torch.tensor( + [ + -0.1352, + 0.0878, + 0.0419, + -0.0818, + -0.1069, + 0.0688, + -0.1458, + -0.4446, + -0.0026, + ] + ) + else: + expected_output_slice = torch.tensor( + [ + -0.2421, + 0.4642, + 0.2507, + -0.0438, + 0.0682, + 0.3160, + -0.2018, + -0.0727, + 0.2485, + ] + ) + + self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) + + +@slow +class AutoencoderKLIntegrationTests(unittest.TestCase): + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): + revision = "fp16" if fp16 else None + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderKL.from_pretrained( + model_id, + subfolder="vae", + torch_dtype=torch_dtype, + revision=revision, + ) + model.to(torch_device) + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.1556, 0.9848, -0.0410, -0.0642, -0.2685, 0.8381, -0.2004, -0.0700], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], + ], + [ + 47, + [-0.2376, 0.1200, 0.1337, -0.4830, -0.2504, -0.0759, -0.0486, -0.4077], + [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], + ], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], + [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], + # fmt: on + ] + ) + @require_torch_accelerator_with_fp16 + def test_stable_diffusion_fp16(self, seed, expected_slice): + model = self.get_sd_vae_model(fp16=True) + image = self.get_sd_image(seed, fp16=True) + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(image, generator=generator, sample_posterior=True).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) + + 
@parameterized.expand( + [ + # fmt: off + [ + 33, + [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], + ], + [ + 47, + [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], + [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], + ], + # fmt: on + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand( + [ + # fmt: off + [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], + [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], + # fmt: on + ] + ) + @require_torch_accelerator + @skip_mps + def test_stable_diffusion_decode(self, seed, expected_slice): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) + + @parameterized.expand( + [ + # fmt: off + [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], + [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], + # fmt: on + ] + ) + @require_torch_accelerator_with_fp16 + def test_stable_diffusion_decode_fp16(self, seed, expected_slice): + model = self.get_sd_vae_model(fp16=True) + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) + + @parameterized.expand([(13,), (16,), (27,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed): + model = self.get_sd_vae_model(fp16=True) + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=1e-1) + + @parameterized.expand([(13,), (16,), (37,)]) + @require_torch_gpu + @unittest.skipIf( + not is_xformers_available(), + reason="xformers is not required when using PyTorch 2.0.", + ) + def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): + model = self.get_sd_vae_model() + encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) + + with torch.no_grad(): + sample = model.decode(encoding).sample + + model.enable_xformers_memory_efficient_attention() + with torch.no_grad(): + sample_2 = 
model.decode(encoding).sample + + assert list(sample.shape) == [3, 3, 512, 512] + + assert torch_all_close(sample, sample_2, atol=1e-2) + + @parameterized.expand( + [ + # fmt: off + [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], + [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], + # fmt: on + ] + ) + def test_stable_diffusion_encode_sample(self, seed, expected_slice): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed) + generator = self.get_generator(seed) + + with torch.no_grad(): + dist = model.encode(image).latent_dist + sample = dist.sample(generator=generator) + + assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] + + output_slice = sample[0, -1, -3:, -3:].flatten().cpu() + expected_output_slice = torch.tensor(expected_slice) + + tolerance = 3e-3 if torch_device != "mps" else 1e-2 + assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py b/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py new file mode 100644 index 000000000000..7336bb3d3e97 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl_cogvideox.py @@ -0,0 +1,179 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch + +from diffusers import AutoencoderKLCogVideoX +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLCogVideoXTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKLCogVideoX + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_kl_cogvideox_config(self): + return { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ( + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ), + "up_block_types": ( + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ), + "block_out_channels": (8, 8, 8, 8), + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 2, + "temporal_compression_ratio": 4, + } + + @property + def dummy_input(self): + batch_size = 4 + num_frames = 8 + num_channels = 3 + sizes = (16, 16) + + image = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 8, 16, 16) + + @property + def output_shape(self): + return (3, 8, 16, 16) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_kl_cogvideox_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the 
outputs when slicing is manually disabled.", + ) + + def test_gradient_checkpointing_is_applied(self): + expected_set = { + "CogVideoXDownBlock3D", + "CogVideoXDecoder3D", + "CogVideoXEncoder3D", + "CogVideoXUpBlock3D", + "CogVideoXMidBlock3D", + } + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_forward_with_norm_groups(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["norm_num_groups"] = 16 + init_dict["block_out_channels"] = (16, 32, 32, 32) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.to_tuple()[0] + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + @unittest.skip("Unsupported test.") + def test_outputs_equivalence(self): + pass diff --git a/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py new file mode 100644 index 000000000000..4308cb64896e --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_kl_temporal_decoder.py @@ -0,0 +1,73 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from diffusers import AutoencoderKLTemporalDecoder +from diffusers.utils.testing_utils import ( + enable_full_determinism, + floats_tensor, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderKLTemporalDecoderTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderKLTemporalDecoder + main_input_name = "sample" + base_precision = 1e-2 + + @property + def dummy_input(self): + batch_size = 3 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + num_frames = 3 + + return {"sample": image, "num_frames": num_frames} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "block_out_channels": [32, 64], + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], + "latent_channels": 4, + "layers_per_block": 2, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"Encoder", "TemporalDecoder"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + @unittest.skip("Test unsupported.") + def test_forward_with_norm_groups(self): + pass diff --git a/tests/models/autoencoders/test_models_autoencoder_oobleck.py b/tests/models/autoencoders/test_models_autoencoder_oobleck.py new file mode 100644 index 000000000000..4807fa298344 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_oobleck.py @@ -0,0 +1,228 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import torch +from datasets import load_dataset +from parameterized import parameterized + +from diffusers import AutoencoderOobleck +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderOobleck + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_oobleck_config(self, block_out_channels=None): + init_dict = { + "encoder_hidden_size": 12, + "decoder_channels": 12, + "decoder_input_channels": 6, + "audio_channels": 2, + "downsampling_ratios": [2, 4], + "channel_multiples": [1, 2], + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 2 + seq_len = 24 + + waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device) + + return {"sample": waveform, "sample_posterior": False} + + @property + def input_shape(self): + return (2, 24) + + @property + def output_shape(self): + return (2, 24) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_oobleck_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + @unittest.skip("Test unsupported.") + def test_forward_with_norm_groups(self): + pass + + @unittest.skip("No attention module used in this model") + def test_set_attn_processor_for_determinism(self): + return + + +@slow +class AutoencoderOobleckIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def _load_datasamples(self, num_samples): + ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True + ) + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + + return torch.nn.utils.rnn.pad_sequence( + [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True + ) + + def get_audio(self, audio_sample_size=2097152, fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + audio = self._load_datasamples(2).to(torch_device).to(dtype) + + # pad / crop to audio_sample_size + audio = torch.nn.functional.pad(audio[:, 
:audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1])) + + # todo channel + audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device) + + return audio + + def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False): + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderOobleck.from_pretrained( + model_id, + subfolder="vae", + torch_dtype=torch_dtype, + ) + model.to(torch_device) + + return model + + def get_generator(self, seed=0): + generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" + if torch_device != "mps": + return torch.Generator(device=generator_device).manual_seed(seed) + return torch.manual_seed(seed) + + @parameterized.expand( + [ + # fmt: off + [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], + [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], + # fmt: on + ] + ) + def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + generator = self.get_generator(seed) + + with torch.no_grad(): + sample = model(audio, generator=generator, sample_posterior=True).sample + + assert sample.shape == audio.shape + assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 + + output_slice = sample[-1, 1, 5:10].cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) + + def test_stable_diffusion_mode(self): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + + with torch.no_grad(): + sample = model(audio, sample_posterior=False).sample + + assert sample.shape == audio.shape + + @parameterized.expand( + [ + # fmt: off + [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], + [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], + # fmt: on + ] + ) + def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff): + model = self.get_oobleck_vae_model() + audio = self.get_audio() + generator = self.get_generator(seed) + + with torch.no_grad(): + x = audio + posterior = model.encode(x).latent_dist + z = posterior.sample(generator=generator) + sample = model.decode(z).sample + + # (batch_size, latent_dim, sequence_length) + assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024) + + assert sample.shape == audio.shape + assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 + + output_slice = sample[-1, 1, 5:10].cpu() + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py new file mode 100644 index 000000000000..4de3822fa835 --- /dev/null +++ b/tests/models/autoencoders/test_models_autoencoder_tiny.py @@ -0,0 +1,251 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import unittest + +import torch +from parameterized import parameterized + +from diffusers import AutoencoderTiny +from diffusers.utils.testing_utils import ( + backend_empty_cache, + enable_full_determinism, + floats_tensor, + load_hf_numpy, + slow, + torch_all_close, + torch_device, +) + +from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin + + +enable_full_determinism() + + +class AutoencoderTinyTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): + model_class = AutoencoderTiny + main_input_name = "sample" + base_precision = 1e-2 + + def get_autoencoder_tiny_config(self, block_out_channels=None): + block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] + init_dict = { + "in_channels": 3, + "out_channels": 3, + "encoder_block_out_channels": block_out_channels, + "decoder_block_out_channels": block_out_channels, + "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], + "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)], + } + return init_dict + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = self.get_autoencoder_tiny_config() + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + @unittest.skip("Model doesn't yet support smaller resolution.") + def test_enable_disable_tiling(self): + pass + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict)[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict)[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict)[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + @unittest.skip("Test not supported.") + def test_outputs_equivalence(self): + pass + + @unittest.skip("Test not supported.") + def test_forward_with_norm_groups(self): + pass + + def test_gradient_checkpointing_is_applied(self): + expected_set = {"DecoderTiny", "EncoderTiny"} + super().test_gradient_checkpointing_is_applied(expected_set=expected_set) + + def test_effective_gradient_checkpointing(self): + if not self.model_class._supports_gradient_checkpointing: + return # Skip test if model does not support gradient checkpointing + + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + inputs_dict_copy = copy.deepcopy(inputs_dict) + torch.manual_seed(0) + model = 
self.model_class(**init_dict) + model.to(torch_device) + + assert not model.is_gradient_checkpointing and model.training + + out = model(**inputs_dict).sample + # run the backwards pass on the model. For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model.zero_grad() + + labels = torch.randn_like(out) + loss = (out - labels).mean() + loss.backward() + + # re-instantiate the model now enabling gradient checkpointing + torch.manual_seed(0) + model_2 = self.model_class(**init_dict) + # clone model + model_2.load_state_dict(model.state_dict()) + model_2.to(torch_device) + model_2.enable_gradient_checkpointing() + + assert model_2.is_gradient_checkpointing and model_2.training + + out_2 = model_2(**inputs_dict_copy).sample + # run the backwards pass on the model. For backwards pass, for simplicity purpose, + # we won't calculate the loss and rather backprop on out.sum() + model_2.zero_grad() + loss_2 = (out_2 - labels).mean() + loss_2.backward() + + # compare the output and parameters gradients + self.assertTrue((loss - loss_2).abs() < 1e-3) + named_params = dict(model.named_parameters()) + named_params_2 = dict(model_2.named_parameters()) + + for name, param in named_params.items(): + if "encoder.layers" in name: + continue + self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=3e-2)) + + +@slow +class AutoencoderTinyIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + backend_empty_cache(torch_device) + + def get_file_format(self, seed, shape): + return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" + + def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): + dtype = torch.float16 if fp16 else torch.float32 + image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) + return image + + def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False): + torch_dtype = torch.float16 if fp16 else torch.float32 + + model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype) + model.to(torch_device).eval() + return model + + @parameterized.expand( + [ + [(1, 4, 73, 97), (1, 3, 584, 776)], + [(1, 4, 97, 73), (1, 3, 776, 584)], + [(1, 4, 49, 65), (1, 3, 392, 520)], + [(1, 4, 65, 49), (1, 3, 520, 392)], + [(1, 4, 49, 49), (1, 3, 392, 392)], + ] + ) + def test_tae_tiling(self, in_shape, out_shape): + model = self.get_sd_vae_model() + model.enable_tiling() + with torch.no_grad(): + zeros = torch.zeros(in_shape).to(torch_device) + dec = model.decode(zeros).sample + assert dec.shape == out_shape + + def test_stable_diffusion(self): + model = self.get_sd_vae_model() + image = self.get_sd_image(seed=33) + + with torch.no_grad(): + sample = model(image).sample + + assert sample.shape == image.shape + + output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() + expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382]) + + assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) + + @parameterized.expand([(True,), (False,)]) + def test_tae_roundtrip(self, enable_tiling): + # load the autoencoder + model = self.get_sd_vae_model() + if enable_tiling: + model.enable_tiling() + + # make a black image with a white square in the middle, + # which is large enough to split across multiple tiles + image = -torch.ones(1, 3, 1024, 1024, device=torch_device) + 
image[..., 256:768, 256:768] = 1.0 + + # round-trip the image through the autoencoder + with torch.no_grad(): + sample = model(image).sample + + # the autoencoder reconstruction should match original image, sorta + def downscale(x): + return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor) + + assert torch_all_close(downscale(sample), downscale(image), atol=0.125) diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py new file mode 100644 index 000000000000..77977a78d83b --- /dev/null +++ b/tests/models/autoencoders/test_models_consistency_decoder_vae.py @@ -0,0 +1,300 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import numpy as np +import torch + +from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_image, + slow, + torch_all_close, + torch_device, +) +from diffusers.utils.torch_utils import randn_tensor + +from ..test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase): + model_class = ConsistencyDecoderVAE + main_input_name = "sample" + base_precision = 1e-2 + forward_requires_fresh_args = True + + def get_consistency_vae_config(self, block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + return { + "encoder_block_out_channels": block_out_channels, + "encoder_in_channels": 3, + "encoder_out_channels": 4, + "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "decoder_add_attention": False, + "decoder_block_out_channels": block_out_channels, + "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels), + "decoder_downsample_padding": 1, + "decoder_in_channels": 7, + "decoder_layers_per_block": 1, + "decoder_norm_eps": 1e-05, + "decoder_norm_num_groups": norm_num_groups, + "encoder_norm_num_groups": norm_num_groups, + "decoder_num_train_timesteps": 1024, + "decoder_out_channels": 6, + "decoder_resnet_time_scale_shift": "scale_shift", + "decoder_time_embedding_type": "learned", + "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels), + "scaling_factor": 1, + "latent_channels": 4, + } + + def inputs_dict(self, seed=None): + if seed is None: + generator = torch.Generator("cpu").manual_seed(0) + else: + generator = torch.Generator("cpu").manual_seed(seed) + image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device)) + + return {"sample": image, "generator": generator} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + @property + def init_dict(self): + return self.get_consistency_vae_config() + + def prepare_init_args_and_inputs_for_common(self): + return 
self.init_dict, self.inputs_dict() + + def test_enable_disable_tiling(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + _ = inputs_dict.pop("generator") + + torch.manual_seed(0) + output_without_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_tiling() + output_with_tiling = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_tiling.detach().cpu().numpy() - output_with_tiling.detach().cpu().numpy()).max(), + 0.5, + "VAE tiling should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_tiling() + output_without_tiling_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_tiling.detach().cpu().numpy().all(), + output_without_tiling_2.detach().cpu().numpy().all(), + "Without tiling outputs should match with the outputs when tiling is manually disabled.", + ) + + def test_enable_disable_slicing(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + torch.manual_seed(0) + model = self.model_class(**init_dict).to(torch_device) + + inputs_dict.update({"return_dict": False}) + _ = inputs_dict.pop("generator") + + torch.manual_seed(0) + output_without_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + torch.manual_seed(0) + model.enable_slicing() + output_with_slicing = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertLess( + (output_without_slicing.detach().cpu().numpy() - output_with_slicing.detach().cpu().numpy()).max(), + 0.5, + "VAE slicing should not affect the inference results", + ) + + torch.manual_seed(0) + model.disable_slicing() + output_without_slicing_2 = model(**inputs_dict, generator=torch.manual_seed(0))[0] + + self.assertEqual( + output_without_slicing.detach().cpu().numpy().all(), + output_without_slicing_2.detach().cpu().numpy().all(), + "Without slicing outputs should match with the outputs when slicing is manually disabled.", + ) + + +@slow +class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase): + def setUp(self): + # clean up the VRAM before each test + super().setUp() + gc.collect() + torch.cuda.empty_cache() + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + @torch.no_grad() + def test_encode_decode(self): + vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update + vae.to(torch_device) + + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/img2img/sketch-mountains-input.jpg" + ).resize((256, 256)) + image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to( + torch_device + ) + + latent = vae.encode(image).latent_dist.mean + + sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample + + actual_output = sample[0, :2, :2, :2].flatten().cpu() + expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024]) + + assert torch_all_close(actual_output, expected_output, atol=5e-3) + + def test_sd(self): + vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update + pipe = StableDiffusionPipeline.from_pretrained( + "stable-diffusion-v1-5/stable-diffusion-v1-5", 
vae=vae, safety_checker=None + ) + pipe.to(torch_device) + + out = pipe( + "horse", + num_inference_steps=2, + output_type="pt", + generator=torch.Generator("cpu").manual_seed(0), + ).images[0] + + actual_output = out[:2, :2, :2].flatten().cpu() + expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759]) + + assert torch_all_close(actual_output, expected_output, atol=5e-3) + + def test_encode_decode_f16(self): + vae = ConsistencyDecoderVAE.from_pretrained( + "openai/consistency-decoder", torch_dtype=torch.float16 + ) # TODO - update + vae.to(torch_device) + + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/img2img/sketch-mountains-input.jpg" + ).resize((256, 256)) + image = ( + torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :] + .half() + .to(torch_device) + ) + + latent = vae.encode(image).latent_dist.mean + + sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample + + actual_output = sample[0, :2, :2, :2].flatten().cpu() + expected_output = torch.tensor( + [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471], + dtype=torch.float16, + ) + + assert torch_all_close(actual_output, expected_output, atol=5e-3) + + def test_sd_f16(self): + vae = ConsistencyDecoderVAE.from_pretrained( + "openai/consistency-decoder", torch_dtype=torch.float16 + ) # TODO - update + pipe = StableDiffusionPipeline.from_pretrained( + "stable-diffusion-v1-5/stable-diffusion-v1-5", + torch_dtype=torch.float16, + vae=vae, + safety_checker=None, + ) + pipe.to(torch_device) + + out = pipe( + "horse", + num_inference_steps=2, + output_type="pt", + generator=torch.Generator("cpu").manual_seed(0), + ).images[0] + + actual_output = out[:2, :2, :2].flatten().cpu() + expected_output = torch.tensor( + [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035], + dtype=torch.float16, + ) + + assert torch_all_close(actual_output, expected_output, atol=5e-3) + + def test_vae_tiling(self): + vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) + pipe = StableDiffusionPipeline.from_pretrained( + "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16 + ) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + out_1 = pipe( + "horse", + num_inference_steps=2, + output_type="pt", + generator=torch.Generator("cpu").manual_seed(0), + ).images[0] + + # make sure tiled vae decode yields the same result + pipe.enable_vae_tiling() + out_2 = pipe( + "horse", + num_inference_steps=2, + output_type="pt", + generator=torch.Generator("cpu").manual_seed(0), + ).images[0] + + assert torch_all_close(out_1, out_2, atol=5e-3) + + # test that tiled decode works with various shapes + shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)] + with torch.no_grad(): + for shape in shapes: + image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype) + pipe.vae.decode(image) diff --git a/tests/models/autoencoders/test_models_vae.py b/tests/models/autoencoders/test_models_vae.py deleted file mode 100644 index d475160cc796..000000000000 --- a/tests/models/autoencoders/test_models_vae.py +++ /dev/null @@ -1,1249 +0,0 @@ -# coding=utf-8 -# Copyright 2024 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from datasets import load_dataset -from parameterized import parameterized - -from diffusers import ( - AsymmetricAutoencoderKL, - AutoencoderKL, - AutoencoderKLTemporalDecoder, - AutoencoderOobleck, - AutoencoderTiny, - ConsistencyDecoderVAE, - StableDiffusionPipeline, -) -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.loading_utils import load_image -from diffusers.utils.testing_utils import ( - backend_empty_cache, - enable_full_determinism, - floats_tensor, - is_peft_available, - load_hf_numpy, - require_peft_backend, - require_torch_accelerator, - require_torch_accelerator_with_fp16, - require_torch_gpu, - skip_mps, - slow, - torch_all_close, - torch_device, -) -from diffusers.utils.torch_utils import randn_tensor - -from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin - - -if is_peft_available(): - from peft import LoraConfig - - -enable_full_determinism() - - -def get_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - init_dict = { - "block_out_channels": block_out_channels, - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), - "latent_channels": 4, - "norm_num_groups": norm_num_groups, - } - return init_dict - - -def get_asym_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - init_dict = { - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "down_block_out_channels": block_out_channels, - "layers_per_down_block": 1, - "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), - "up_block_out_channels": block_out_channels, - "layers_per_up_block": 1, - "act_fn": "silu", - "latent_channels": 4, - "norm_num_groups": norm_num_groups, - "sample_size": 32, - "scaling_factor": 0.18215, - } - return init_dict - - -def get_autoencoder_tiny_config(block_out_channels=None): - block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] - init_dict = { - "in_channels": 3, - "out_channels": 3, - "encoder_block_out_channels": block_out_channels, - "decoder_block_out_channels": block_out_channels, - "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], - "num_decoder_blocks": [b // min(block_out_channels) for b in reversed(block_out_channels)], - } - return init_dict - - -def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None): - block_out_channels = block_out_channels or [2, 4] - norm_num_groups = norm_num_groups or 2 - return { - "encoder_block_out_channels": block_out_channels, - "encoder_in_channels": 3, - "encoder_out_channels": 4, - "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), - "decoder_add_attention": False, - 
"decoder_block_out_channels": block_out_channels, - "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels), - "decoder_downsample_padding": 1, - "decoder_in_channels": 7, - "decoder_layers_per_block": 1, - "decoder_norm_eps": 1e-05, - "decoder_norm_num_groups": norm_num_groups, - "encoder_norm_num_groups": norm_num_groups, - "decoder_num_train_timesteps": 1024, - "decoder_out_channels": 6, - "decoder_resnet_time_scale_shift": "scale_shift", - "decoder_time_embedding_type": "learned", - "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels), - "scaling_factor": 1, - "latent_channels": 4, - } - - -def get_autoencoder_oobleck_config(block_out_channels=None): - init_dict = { - "encoder_hidden_size": 12, - "decoder_channels": 12, - "decoder_input_channels": 6, - "audio_channels": 2, - "downsampling_ratios": [2, 4], - "channel_multiples": [1, 2], - } - return init_dict - - -class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_kl_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_training(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"Decoder", "Encoder"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") - model = model.to(torch_device) - model.eval() - - # Keep generator on CPU for non-CUDA devices to compare outputs with CPU result tensors - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - generator = torch.Generator(device=generator_device).manual_seed(0) - else: - generator = torch.manual_seed(0) - - image = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image, sample_posterior=True, generator=generator).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - - # Since the VAE Gaussian prior's generator is seeded on the appropriate device, - # the expected output slices are not the same for CPU and GPU. 
- if torch_device == "mps": - expected_output_slice = torch.tensor( - [ - -4.0078e-01, - -3.8323e-04, - -1.2681e-01, - -1.1462e-01, - 2.0095e-01, - 1.0893e-01, - -8.8247e-02, - -3.0361e-01, - -9.8644e-03, - ] - ) - elif generator_device == "cpu": - expected_output_slice = torch.tensor( - [ - -0.1352, - 0.0878, - 0.0419, - -0.0818, - -0.1069, - 0.0688, - -0.1458, - -0.4446, - -0.0026, - ] - ) - else: - expected_output_slice = torch.tensor( - [ - -0.2421, - 0.4642, - 0.2507, - -0.0438, - 0.0682, - 0.3160, - -0.2018, - -0.0727, - 0.2485, - ] - ) - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - @require_peft_backend - def test_lora_adapter(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - vae = self.model_class(**init_dict) - - target_modules_vae = [ - "conv1", - "conv2", - "conv_in", - "conv_shortcut", - "conv", - "conv_out", - "skip_conv_1", - "skip_conv_2", - "skip_conv_3", - "skip_conv_4", - "to_k", - "to_q", - "to_v", - "to_out.0", - ] - vae_lora_config = LoraConfig( - r=16, - init_lora_weights="gaussian", - target_modules=target_modules_vae, - ) - - vae.add_adapter(vae_lora_config, adapter_name="vae_lora") - active_lora = vae.active_adapters() - self.assertTrue(len(active_lora) == 1) - self.assertTrue(active_lora[0] == "vae_lora") - - -class AsymmetricAutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AsymmetricAutoencoderKL - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - mask = torch.ones((batch_size, 1) + sizes).to(torch_device) - - return {"sample": image, "mask": mask} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_asym_autoencoder_kl_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_forward_with_norm_groups(self): - pass - - -class AutoencoderTinyTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderTiny - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_tiny_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_outputs_equivalence(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"DecoderTiny", "EncoderTiny"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - @unittest.skip( - "Gradient checkpointing is supported but this test doesn't apply to this class because it's forward is a bit different from the rest." 
- ) - def test_effective_gradient_checkpointing(self): - pass - - -class ConsistencyDecoderVAETests(ModelTesterMixin, unittest.TestCase): - model_class = ConsistencyDecoderVAE - main_input_name = "sample" - base_precision = 1e-2 - forward_requires_fresh_args = True - - def inputs_dict(self, seed=None): - if seed is None: - generator = torch.Generator("cpu").manual_seed(0) - else: - generator = torch.Generator("cpu").manual_seed(seed) - image = randn_tensor((4, 3, 32, 32), generator=generator, device=torch.device(torch_device)) - - return {"sample": image, "generator": generator} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - @property - def init_dict(self): - return get_consistency_vae_config() - - def prepare_init_args_and_inputs_for_common(self): - return self.init_dict, self.inputs_dict() - - @unittest.skip - def test_training(self): - ... - - @unittest.skip - def test_ema_training(self): - ... - - -class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKLTemporalDecoder - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 3 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - num_frames = 3 - - return {"sample": image, "num_frames": num_frames} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "latent_channels": 4, - "layers_per_block": 2, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_training(self): - pass - - def test_gradient_checkpointing_is_applied(self): - expected_set = {"Encoder", "TemporalDecoder"} - super().test_gradient_checkpointing_is_applied(expected_set=expected_set) - - -class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): - model_class = AutoencoderOobleck - main_input_name = "sample" - base_precision = 1e-2 - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 2 - seq_len = 24 - - waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device) - - return {"sample": waveform, "sample_posterior": False} - - @property - def input_shape(self): - return (2, 24) - - @property - def output_shape(self): - return (2, 24) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = get_autoencoder_oobleck_config() - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skip("Not tested.") - def test_forward_signature(self): - pass - - @unittest.skip("Not tested.") - def test_forward_with_norm_groups(self): - pass - - @unittest.skip("No attention module used in this model") - def test_set_attn_processor_for_determinism(self): - return - - -@slow -class AutoencoderTinyIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 
512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="hf-internal-testing/taesd-diffusers", fp16=False): - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderTiny.from_pretrained(model_id, torch_dtype=torch_dtype) - model.to(torch_device).eval() - return model - - @parameterized.expand( - [ - [(1, 4, 73, 97), (1, 3, 584, 776)], - [(1, 4, 97, 73), (1, 3, 776, 584)], - [(1, 4, 49, 65), (1, 3, 392, 520)], - [(1, 4, 65, 49), (1, 3, 520, 392)], - [(1, 4, 49, 49), (1, 3, 392, 392)], - ] - ) - def test_tae_tiling(self, in_shape, out_shape): - model = self.get_sd_vae_model() - model.enable_tiling() - with torch.no_grad(): - zeros = torch.zeros(in_shape).to(torch_device) - dec = model.decode(zeros).sample - assert dec.shape == out_shape - - def test_stable_diffusion(self): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed=33) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor([0.0093, 0.6385, -0.1274, 0.1631, -0.1762, 0.5232, -0.3108, -0.0382]) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand([(True,), (False,)]) - def test_tae_roundtrip(self, enable_tiling): - # load the autoencoder - model = self.get_sd_vae_model() - if enable_tiling: - model.enable_tiling() - - # make a black image with a white square in the middle, - # which is large enough to split across multiple tiles - image = -torch.ones(1, 3, 1024, 1024, device=torch_device) - image[..., 256:768, 256:768] = 1.0 - - # round-trip the image through the autoencoder - with torch.no_grad(): - sample = model(image).sample - - # the autoencoder reconstruction should match original image, sorta - def downscale(x): - return torch.nn.functional.avg_pool2d(x, model.spatial_scale_factor) - - assert torch_all_close(downscale(sample), downscale(image), atol=0.125) - - -@slow -class AutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device) - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.1556, 0.9848, -0.0410, -0.0642, -0.2685, 0.8381, -0.2004, -0.0700], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2376, 0.1200, 
0.1337, -0.4830, -0.2504, -0.0759, -0.0486, -0.4077], - [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], - [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], - # fmt: on - ] - ) - @require_torch_accelerator_with_fp16 - def test_stable_diffusion_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - image = self.get_sd_image(seed, fp16=True) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], - [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824], - ], - [ - 47, - [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], - [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], - ], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], - [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], - # fmt: on - ] - ) - @require_torch_accelerator - @skip_mps - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], - [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], - # fmt: on - ] - ) - @require_torch_accelerator_with_fp16 - def test_stable_diffusion_decode_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 
64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand([(13,), (16,), (27,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0_fp16(self, seed): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=1e-1) - - @parameterized.expand([(13,), (16,), (37,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 3e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) - - -@slow -class AsymmetricAutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="cross-attention/asymmetric-autoencoder-kl-x-1-5", fp16=False): - revision = "main" - torch_dtype = torch.float32 - - model = AsymmetricAutoencoderKL.from_pretrained( - model_id, - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device).eval() - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return 
torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.0336, 0.3011, 0.1764, 0.0087, -0.3401, 0.3645, -0.1247, 0.1205], - [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], - ], - [ - 47, - [0.4400, 0.0543, 0.2873, 0.2946, 0.0553, 0.0839, -0.1585, 0.2529], - [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], - ], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [ - 33, - [-0.0340, 0.2870, 0.1698, -0.0105, -0.3448, 0.3529, -0.1321, 0.1097], - [-0.0344, 0.2912, 0.1687, -0.0137, -0.3462, 0.3552, -0.1337, 0.1078], - ], - [ - 47, - [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], - [0.4397, 0.0550, 0.2873, 0.2946, 0.0567, 0.0855, -0.1580, 0.2531], - ], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=3e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.0521, -0.2939, 0.1540, -0.1855, -0.5936, -0.3138, -0.4579, -0.2275]], - [37, [-0.1820, -0.4345, -0.0455, -0.2923, -0.8035, -0.5089, -0.4795, -0.3106]], - # fmt: on - ] - ) - @require_torch_accelerator - @skip_mps - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=2e-3) - - @parameterized.expand([(13,), (16,), (37,)]) - @require_torch_gpu - @unittest.skipIf( - not is_xformers_available(), - reason="xformers is not required when using PyTorch 2.0.", - ) - def test_stable_diffusion_decode_xformers_vs_2_0(self, seed): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - model.enable_xformers_memory_efficient_attention() - with torch.no_grad(): - sample_2 = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - assert torch_all_close(sample, sample_2, atol=5e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def 
test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 3e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) - - -@slow -class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase): - def setUp(self): - # clean up the VRAM before each test - super().setUp() - gc.collect() - torch.cuda.empty_cache() - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @torch.no_grad() - def test_encode_decode(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update - vae.to(torch_device) - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ).resize((256, 256)) - image = torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :].to( - torch_device - ) - - latent = vae.encode(image).latent_dist.mean - - sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample - - actual_output = sample[0, :2, :2, :2].flatten().cpu() - expected_output = torch.tensor([-0.0141, -0.0014, 0.0115, 0.0086, 0.1051, 0.1053, 0.1031, 0.1024]) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_sd(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder") # TODO - update - pipe = StableDiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None - ) - pipe.to(torch_device) - - out = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - actual_output = out[:2, :2, :2].flatten().cpu() - expected_output = torch.tensor([0.7686, 0.8228, 0.6489, 0.7455, 0.8661, 0.8797, 0.8241, 0.8759]) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_encode_decode_f16(self): - vae = ConsistencyDecoderVAE.from_pretrained( - "openai/consistency-decoder", torch_dtype=torch.float16 - ) # TODO - update - vae.to(torch_device) - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ).resize((256, 256)) - image = ( - torch.from_numpy(np.array(image).transpose(2, 0, 1).astype(np.float32) / 127.5 - 1)[None, :, :, :] - .half() - .to(torch_device) - ) - - latent = vae.encode(image).latent_dist.mean - - sample = vae.decode(latent, generator=torch.Generator("cpu").manual_seed(0)).sample - - actual_output = sample[0, :2, :2, :2].flatten().cpu() - expected_output = torch.tensor( - [-0.0111, -0.0125, -0.0017, -0.0007, 0.1257, 0.1465, 0.1450, 0.1471], - dtype=torch.float16, - ) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_sd_f16(self): - vae = ConsistencyDecoderVAE.from_pretrained( - "openai/consistency-decoder", torch_dtype=torch.float16 - ) # TODO - update - pipe = StableDiffusionPipeline.from_pretrained( - 
"stable-diffusion-v1-5/stable-diffusion-v1-5", - torch_dtype=torch.float16, - vae=vae, - safety_checker=None, - ) - pipe.to(torch_device) - - out = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - actual_output = out[:2, :2, :2].flatten().cpu() - expected_output = torch.tensor( - [0.0000, 0.0249, 0.0000, 0.0000, 0.1709, 0.2773, 0.0471, 0.1035], - dtype=torch.float16, - ) - - assert torch_all_close(actual_output, expected_output, atol=5e-3) - - def test_vae_tiling(self): - vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) - pipe = StableDiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, safety_checker=None, torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - out_1 = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - # make sure tiled vae decode yields the same result - pipe.enable_vae_tiling() - out_2 = pipe( - "horse", - num_inference_steps=2, - output_type="pt", - generator=torch.Generator("cpu").manual_seed(0), - ).images[0] - - assert torch_all_close(out_1, out_2, atol=5e-3) - - # test that tiled decode works with various shapes - shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)] - with torch.no_grad(): - for shape in shapes: - image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype) - pipe.vae.decode(image) - - -@slow -class AutoencoderOobleckIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - backend_empty_cache(torch_device) - - def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) - # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - - return torch.nn.utils.rnn.pad_sequence( - [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True - ) - - def get_audio(self, audio_sample_size=2097152, fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - audio = self._load_datasamples(2).to(torch_device).to(dtype) - - # pad / crop to audio_sample_size - audio = torch.nn.functional.pad(audio[:, :audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1])) - - # todo channel - audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device) - - return audio - - def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False): - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderOobleck.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - ) - model.to(torch_device) - - return model - - def get_generator(self, seed=0): - generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda" - if torch_device != "mps": - return torch.Generator(device=generator_device).manual_seed(seed) - return torch.manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], - [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - generator = 
self.get_generator(seed) - - with torch.no_grad(): - sample = model(audio, generator=generator, sample_posterior=True).sample - - assert sample.shape == audio.shape - assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 - - output_slice = sample[-1, 1, 5:10].cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) - - def test_stable_diffusion_mode(self): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - - with torch.no_grad(): - sample = model(audio, sample_posterior=False).sample - - assert sample.shape == audio.shape - - @parameterized.expand( - [ - # fmt: off - [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192], - [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196], - # fmt: on - ] - ) - def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff): - model = self.get_oobleck_vae_model() - audio = self.get_audio() - generator = self.get_generator(seed) - - with torch.no_grad(): - x = audio - posterior = model.encode(x).latent_dist - z = posterior.sample(generator=generator) - sample = model.decode(z).sample - - # (batch_size, latent_dim, sequence_length) - assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024) - - assert sample.shape == audio.shape - assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6 - - output_slice = sample[-1, 1, 5:10].cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-5) diff --git a/tests/models/autoencoders/vae.py b/tests/models/autoencoders/vae.py new file mode 100644 index 000000000000..f8055f1c1cb0 --- /dev/null +++ b/tests/models/autoencoders/vae.py @@ -0,0 +1,86 @@ +def get_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "block_out_channels": block_out_channels, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + } + return init_dict + + +def get_asym_autoencoder_kl_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + init_dict = { + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "down_block_out_channels": block_out_channels, + "layers_per_down_block": 1, + "up_block_types": ["UpDecoderBlock2D"] * len(block_out_channels), + "up_block_out_channels": block_out_channels, + "layers_per_up_block": 1, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": norm_num_groups, + "sample_size": 32, + "scaling_factor": 0.18215, + } + return init_dict + + +def get_autoencoder_tiny_config(block_out_channels=None): + block_out_channels = (len(block_out_channels) * [32]) if block_out_channels is not None else [32, 32] + init_dict = { + "in_channels": 3, + "out_channels": 3, + "encoder_block_out_channels": block_out_channels, + "decoder_block_out_channels": block_out_channels, + "num_encoder_blocks": [b // min(block_out_channels) for b in block_out_channels], + "num_decoder_blocks": [b // min(block_out_channels) for b in 
reversed(block_out_channels)], + } + return init_dict + + +def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None): + block_out_channels = block_out_channels or [2, 4] + norm_num_groups = norm_num_groups or 2 + return { + "encoder_block_out_channels": block_out_channels, + "encoder_in_channels": 3, + "encoder_out_channels": 4, + "encoder_down_block_types": ["DownEncoderBlock2D"] * len(block_out_channels), + "decoder_add_attention": False, + "decoder_block_out_channels": block_out_channels, + "decoder_down_block_types": ["ResnetDownsampleBlock2D"] * len(block_out_channels), + "decoder_downsample_padding": 1, + "decoder_in_channels": 7, + "decoder_layers_per_block": 1, + "decoder_norm_eps": 1e-05, + "decoder_norm_num_groups": norm_num_groups, + "encoder_norm_num_groups": norm_num_groups, + "decoder_num_train_timesteps": 1024, + "decoder_out_channels": 6, + "decoder_resnet_time_scale_shift": "scale_shift", + "decoder_time_embedding_type": "learned", + "decoder_up_block_types": ["ResnetUpsampleBlock2D"] * len(block_out_channels), + "scaling_factor": 1, + "latent_channels": 4, + } + + +def get_autoencoder_oobleck_config(block_out_channels=None): + init_dict = { + "encoder_hidden_size": 12, + "decoder_channels": 12, + "decoder_input_channels": 6, + "audio_channels": 2, + "downsampling_ratios": [2, 4], + "channel_multiples": [1, 2], + } + return init_dict diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index f6ce6bda7381..a7594f2ea13f 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -858,11 +858,6 @@ def test_gradient_checkpointing_is_applied( ): if not self.model_class._supports_gradient_checkpointing: return # Skip test if model does not support gradient checkpointing - if self.model_class.__name__ in [ - "UNetSpatioTemporalConditionModel", - "AutoencoderKLTemporalDecoder", - ]: - return init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs.py b/tests/pipelines/controlnet_xs/test_controlnetxs.py index 007a2b0e46d7..508e5008a786 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs.py @@ -47,7 +47,7 @@ ) from diffusers.utils.torch_utils import randn_tensor -from ...models.autoencoders.test_models_vae import ( +from ...models.autoencoders.vae import ( get_asym_autoencoder_kl_config, get_autoencoder_kl_config, get_autoencoder_tiny_config, diff --git a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py index c940504d6c3e..53cb070c9be4 100644 --- a/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py +++ b/tests/pipelines/controlnet_xs/test_controlnetxs_sdxl.py @@ -34,7 +34,7 @@ from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device from diffusers.utils.torch_utils import randn_tensor -from ...models.autoencoders.test_models_vae import ( +from ...models.autoencoders.vae import ( get_asym_autoencoder_kl_config, get_autoencoder_kl_config, get_autoencoder_tiny_config, diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 7ec677558059..4d2b534c9a28 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -48,7 +48,7 @@ torch_device, ) -from ..models.autoencoders.test_models_vae import ( +from ..models.autoencoders.vae import ( 
get_asym_autoencoder_kl_config, get_autoencoder_kl_config, get_autoencoder_tiny_config, From 9ff72433fa5a4d9f9e2f2c599e394480b581c614 Mon Sep 17 00:00:00 2001 From: fancy45daddy <124528204+fancy45daddy@users.noreply.github.com> Date: Wed, 4 Dec 2024 03:24:22 -0800 Subject: [PATCH 30/38] add torch_xla support in pipeline_stable_audio.py (#10109) Update pipeline_stable_audio.py --- .../pipelines/stable_audio/pipeline_stable_audio.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py index 4fe082d88957..a30af53f77a7 100644 --- a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py +++ b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py @@ -26,6 +26,7 @@ from ...models.embeddings import get_1d_rotary_pos_embed from ...schedulers import EDMDPMSolverMultistepScheduler from ...utils import ( + is_torch_xla_available, logging, replace_example_docstring, ) @@ -33,6 +34,12 @@ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline from .modeling_stable_audio import StableAudioProjectionModel +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -725,6 +732,9 @@ def __call__( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() # 9. Post-processing if not output_type == "latent": From 8a450c3da0b6a8aca1c36a4f3ea7f0096033cf56 Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 4 Dec 2024 12:17:42 +0000 Subject: [PATCH 31/38] Fix `pipeline_stable_audio` formating (#10114) --- src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py index a30af53f77a7..cef63cf7e63d 100644 --- a/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py +++ b/src/diffusers/pipelines/stable_audio/pipeline_stable_audio.py @@ -34,6 +34,7 @@ from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline from .modeling_stable_audio import StableAudioProjectionModel + if is_torch_xla_available(): import torch_xla.core.xla_model as xm @@ -732,7 +733,7 @@ def __call__( if callback is not None and i % callback_steps == 0: step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents) - + if XLA_AVAILABLE: xm.mark_step() From e8da75dff53095fb0adc1aab4132402a2c02f569 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 4 Dec 2024 22:27:43 +0530 Subject: [PATCH 32/38] [bitsandbytes] allow directly CUDA placements of pipelines loaded with bnb components (#9840) * allow device placement when using bnb quantization. * warning. * tests * fixes * docs. * require accelerate version. * remove print. * revert to() * tests * fixes * fix: missing AutoencoderKL lora adapter (#9807) * fix: missing AutoencoderKL lora adapter * fix --------- Co-authored-by: Sayak Paul * fixes * fix condition test * updates * updates * remove is_offloaded. 
* fixes * better * empty --------- Co-authored-by: Emmanuel Benazera --- src/diffusers/pipelines/pipeline_utils.py | 16 +++++--- tests/quantization/bnb/test_4bit.py | 45 ++++++++++++++++++++++- tests/quantization/bnb/test_mixed_int8.py | 36 ++++++++++++++++++ 3 files changed, 91 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 5a4219adcb37..a504184ea2f2 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -66,7 +66,6 @@ if is_torch_npu_available(): import torch_npu # noqa: F401 - from .pipeline_loading_utils import ( ALL_IMPORTABLE_CLASSES, CONNECTED_PIPES_KEYS, @@ -388,6 +387,7 @@ def to(self, *args, **kwargs): ) device = device or device_arg + pipeline_has_bnb = any(any((_check_bnb_status(module))) for _, module in self.components.items()) # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU. def module_is_sequentially_offloaded(module): @@ -410,10 +410,16 @@ def module_is_offloaded(module): pipeline_is_sequentially_offloaded = any( module_is_sequentially_offloaded(module) for _, module in self.components.items() ) - if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda": - raise ValueError( - "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading." - ) + if device and torch.device(device).type == "cuda": + if pipeline_is_sequentially_offloaded and not pipeline_has_bnb: + raise ValueError( + "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading." + ) + # PR: https://github.com/huggingface/accelerate/pull/3223/ + elif pipeline_has_bnb and is_accelerate_version("<", "1.1.0.dev0"): + raise ValueError( + "You are trying to call `.to('cuda')` on a pipeline that has models quantized with `bitsandbytes`. Your current `accelerate` installation does not support it. Please upgrade the installation." 
+ ) is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1 if is_pipeline_device_mapped: diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index b548b03be31d..1e631114f038 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -18,10 +18,11 @@ import unittest import numpy as np +import pytest import safetensors.torch from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel -from diffusers.utils import logging +from diffusers.utils import is_accelerate_version, logging from diffusers.utils.testing_utils import ( CaptureLogger, is_bitsandbytes_available, @@ -47,6 +48,7 @@ def get_some_linear_layer(model): if is_transformers_available(): + from transformers import BitsAndBytesConfig as BnbConfig from transformers import T5EncoderModel if is_torch_available(): @@ -483,6 +485,47 @@ def test_moving_to_cpu_throws_warning(self): assert "Pipelines loaded with `dtype=torch.float16`" in cap_logger.out + @pytest.mark.xfail( + condition=is_accelerate_version("<=", "1.1.1"), + reason="Test will pass after https://github.com/huggingface/accelerate/pull/3223 is in a release.", + strict=True, + ) + def test_pipeline_cuda_placement_works_with_nf4(self): + transformer_nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + ) + transformer_4bit = SD3Transformer2DModel.from_pretrained( + self.model_name, + subfolder="transformer", + quantization_config=transformer_nf4_config, + torch_dtype=torch.float16, + ) + text_encoder_3_nf4_config = BnbConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + ) + text_encoder_3_4bit = T5EncoderModel.from_pretrained( + self.model_name, + subfolder="text_encoder_3", + quantization_config=text_encoder_3_nf4_config, + torch_dtype=torch.float16, + ) + # CUDA device placement works. + pipeline_4bit = DiffusionPipeline.from_pretrained( + self.model_name, + transformer=transformer_4bit, + text_encoder_3=text_encoder_3_4bit, + torch_dtype=torch.float16, + ).to("cuda") + + # Check if inference works. 
+ _ = pipeline_4bit("table", max_sequence_length=20, num_inference_steps=2) + + del pipeline_4bit + @require_transformers_version_greater("4.44.0") class SlowBnb4BitFluxTests(Base4bitTests): diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index a67e8d38e961..f474a1d4f4d0 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -17,8 +17,10 @@ import unittest import numpy as np +import pytest from diffusers import BitsAndBytesConfig, DiffusionPipeline, FluxTransformer2DModel, SD3Transformer2DModel, logging +from diffusers.utils import is_accelerate_version from diffusers.utils.testing_utils import ( CaptureLogger, is_bitsandbytes_available, @@ -44,6 +46,7 @@ def get_some_linear_layer(model): if is_transformers_available(): + from transformers import BitsAndBytesConfig as BnbConfig from transformers import T5EncoderModel if is_torch_available(): @@ -432,6 +435,39 @@ def test_generate_quality_dequantize(self): output_type="np", ).images + @pytest.mark.xfail( + condition=is_accelerate_version("<=", "1.1.1"), + reason="Test will pass after https://github.com/huggingface/accelerate/pull/3223 is in a release.", + strict=True, + ) + def test_pipeline_cuda_placement_works_with_mixed_int8(self): + transformer_8bit_config = BitsAndBytesConfig(load_in_8bit=True) + transformer_8bit = SD3Transformer2DModel.from_pretrained( + self.model_name, + subfolder="transformer", + quantization_config=transformer_8bit_config, + torch_dtype=torch.float16, + ) + text_encoder_3_8bit_config = BnbConfig(load_in_8bit=True) + text_encoder_3_8bit = T5EncoderModel.from_pretrained( + self.model_name, + subfolder="text_encoder_3", + quantization_config=text_encoder_3_8bit_config, + torch_dtype=torch.float16, + ) + # CUDA device placement works. + pipeline_8bit = DiffusionPipeline.from_pretrained( + self.model_name, + transformer=transformer_8bit, + text_encoder_3=text_encoder_3_8bit, + torch_dtype=torch.float16, + ).to("cuda") + + # Check if inference works. + _ = pipeline_8bit("table", max_sequence_length=20, num_inference_steps=2) + + del pipeline_8bit + @require_transformers_version_greater("4.44.0") class SlowBnb8bitFluxTests(Base8bitTests): From 25ddc7945bb0f133b65518edfc789c1d9ca61be2 Mon Sep 17 00:00:00 2001 From: Parag Ekbote Date: Wed, 4 Dec 2024 22:34:31 +0530 Subject: [PATCH 33/38] Fix Broken Links in ReadMe (#10117) Update broken links in ReadME. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afecd64d9521..dac3b3598aaf 100644 --- a/README.md +++ b/README.md @@ -112,8 +112,8 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l | **Documentation** | **What can I learn?** | |---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. 
| -| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. | -| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. | +| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. | +| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/overview_techniques) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. | | [Optimization](https://huggingface.co/docs/diffusers/optimization/fp16) | Guides for how to optimize your diffusion model to run faster and consume less memory. | | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. | ## Contribution From a2d424eb2ed2be3f1d77ad9a5a1f309825c6c863 Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 4 Dec 2024 18:42:47 +0000 Subject: [PATCH 34/38] Add `sigmas` to pipelines using FlowMatch (#10116) --- .../pipelines/aura_flow/pipeline_aura_flow.py | 9 +-------- .../pipeline_stable_diffusion_3_controlnet.py | 12 ++++++------ ...eline_stable_diffusion_3_controlnet_inpainting.py | 12 ++++++------ src/diffusers/pipelines/lumina/pipeline_lumina.py | 9 +-------- src/diffusers/pipelines/pag/pipeline_pag_sd_3.py | 12 ++++++------ .../pipelines/pag/pipeline_pag_sd_3_img2img.py | 12 ++++++------ .../pipeline_stable_diffusion_3.py | 12 ++++++------ .../pipeline_stable_diffusion_3_img2img.py | 12 ++++++------ .../pipeline_stable_diffusion_3_inpaint.py | 12 ++++++------ 9 files changed, 44 insertions(+), 58 deletions(-) diff --git a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py index 58eaf6b46d0a..8737b219c833 100644 --- a/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py +++ b/src/diffusers/pipelines/aura_flow/pipeline_aura_flow.py @@ -387,7 +387,6 @@ def __call__( prompt: Union[str, List[str]] = None, negative_prompt: Union[str, List[str]] = None, num_inference_steps: int = 50, - timesteps: List[int] = None, sigmas: List[float] = None, guidance_scale: float = 3.5, num_images_per_prompt: Optional[int] = 1, @@ -424,10 +423,6 @@ def __call__( sigmas (`List[float]`, *optional*): Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, `num_inference_steps` and `timesteps` must be `None`. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. guidance_scale (`float`, *optional*, defaults to 5.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. 
of [Imagen @@ -522,9 +517,7 @@ def __call__( # 4. Prepare timesteps # sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) # 5. Prepare latents. latent_channels = self.transformer.config.in_channels diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py index 8fd07fafc766..983fff307755 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet.py @@ -733,7 +733,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, @@ -778,10 +778,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 5.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -998,7 +998,7 @@ def __call__( assert False # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) diff --git a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py index 437bb9f2f182..5d5249922f8d 100644 --- a/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py +++ b/src/diffusers/pipelines/controlnet_sd3/pipeline_stable_diffusion_3_controlnet_inpainting.py @@ -787,7 +787,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, control_guidance_start: Union[float, List[float]] = 0.0, control_guidance_end: Union[float, List[float]] = 1.0, @@ -833,10 +833,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 5.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -1033,7 +1033,7 @@ def __call__( controlnet_pooled_projections = controlnet_pooled_projections or pooled_prompt_embeds # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) diff --git a/src/diffusers/pipelines/lumina/pipeline_lumina.py b/src/diffusers/pipelines/lumina/pipeline_lumina.py index 018f2e8bf1bc..0a59d98919f0 100644 --- a/src/diffusers/pipelines/lumina/pipeline_lumina.py +++ b/src/diffusers/pipelines/lumina/pipeline_lumina.py @@ -617,7 +617,6 @@ def __call__( width: Optional[int] = None, height: Optional[int] = None, num_inference_steps: int = 30, - timesteps: List[int] = None, guidance_scale: float = 4.0, negative_prompt: Union[str, List[str]] = None, sigmas: List[float] = None, @@ -649,10 +648,6 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 30): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. sigmas (`List[float]`, *optional*): Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed @@ -776,9 +771,7 @@ def __call__( prompt_attention_mask = torch.cat([prompt_attention_mask, negative_prompt_attention_mask], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, num_inference_steps, device, timesteps, sigmas - ) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) # 5. Prepare latents. 
latent_channels = self.transformer.config.in_channels diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py index c6f9077ad3da..d1b96e75574f 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3.py @@ -693,7 +693,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, @@ -735,10 +735,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -890,7 +890,7 @@ def __call__( pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) diff --git a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py index 54e37e0fd286..01d29867dea3 100644 --- a/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py +++ b/src/diffusers/pipelines/pag/pipeline_pag_sd_3_img2img.py @@ -733,7 +733,7 @@ def __call__( image: PipelineImageInput = None, strength: float = 0.6, num_inference_steps: int = 50, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, @@ -783,10 +783,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -936,7 +936,7 @@ def __call__( image = self.image_processor.preprocess(image) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # 5. Prepare latent variables diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py index aee1ad8c75f5..513f86441c3a 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py @@ -679,7 +679,7 @@ def __call__( height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 28, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, @@ -723,10 +723,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -883,7 +883,7 @@ def __call__( pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0) # 4. 
Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py index a07a056ec851..c91b4ee80eaa 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_img2img.py @@ -713,7 +713,7 @@ def __call__( image: PipelineImageInput = None, strength: float = 0.6, num_inference_steps: int = 50, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, @@ -753,10 +753,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -893,7 +893,7 @@ def __call__( image = self.image_processor.preprocess(image) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) diff --git a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py index d3e0ecf9c3a7..43cb9e5ad0b6 100644 --- a/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3_inpaint.py @@ -806,7 +806,7 @@ def __call__( padding_mask_crop: Optional[int] = None, strength: float = 0.6, num_inference_steps: int = 50, - timesteps: List[int] = None, + sigmas: Optional[List[float]] = None, guidance_scale: float = 7.0, negative_prompt: Optional[Union[str, List[str]]] = None, negative_prompt_2: Optional[Union[str, List[str]]] = None, @@ -874,10 +874,10 @@ def __call__( num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
- timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. guidance_scale (`float`, *optional*, defaults to 7.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). `guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -1007,7 +1007,7 @@ def __call__( pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0) # 3. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas) timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) # check that number of inference steps is not < 1 - as this doesn't make sense if num_inference_steps < 1: From 04bba387257822f21fb54aba90bc328e27468f42 Mon Sep 17 00:00:00 2001 From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com> Date: Wed, 4 Dec 2024 20:48:32 +0200 Subject: [PATCH 35/38] [Flux Redux] add prompt & multiple image input (#10056) * add multiple prompts to flux redux --------- Co-authored-by: hlky --- .../flux/pipeline_flux_prior_redux.py | 97 ++++++++++++++++++- 1 file changed, 92 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py b/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py index cf50e89ca5ae..f53958df2ed0 100644 --- a/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py +++ b/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py @@ -142,6 +142,45 @@ def __init__( self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77 ) + def check_inputs( + self, + image, + prompt, + prompt_2, + prompt_embeds=None, + pooled_prompt_embeds=None, + prompt_embeds_scale=1.0, + pooled_prompt_embeds_scale=1.0, + ): + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + if prompt is not None and (isinstance(prompt, list) and isinstance(image, list) and len(prompt) != len(image)): + raise ValueError( + f"number of prompts must be equal to number of images, but {len(prompt)} prompts were provided and {len(image)} images" + ) + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + if isinstance(prompt_embeds_scale, list) and ( + isinstance(image, list) and len(prompt_embeds_scale) != len(image) + ): + raise ValueError( + f"number of weights must be equal to number of images, but {len(prompt_embeds_scale)} weights were provided and {len(image)} images" + ) + def encode_image(self, image, device, num_images_per_prompt): dtype = next(self.image_encoder.parameters()).dtype image = self.feature_extractor.preprocess( @@ -334,6 +373,12 @@ def encode_prompt( def __call__( self, image: PipelineImageInput, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_scale: Optional[Union[float, List[float]]] = 1.0, + pooled_prompt_embeds_scale: Optional[Union[float, List[float]]] = 1.0, return_dict: bool = True, ): r""" @@ -345,6 +390,16 @@ def __call__( numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. **experimental feature**: to use this feature, + make sure to explicitly load text encoders to the pipeline. Prompts will be ignored if text encoders + are not loaded. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.flux.FluxPriorReduxPipelineOutput`] instead of a plain tuple. @@ -356,6 +411,17 @@ def __call__( returning a tuple, the first element is a list with the generated images. """ + # 1. Check inputs. Raise error if not correct + self.check_inputs( + image, + prompt, + prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + prompt_embeds_scale=prompt_embeds_scale, + pooled_prompt_embeds_scale=pooled_prompt_embeds_scale, + ) + # 2. 
Define call parameters if image is not None and isinstance(image, Image.Image): batch_size = 1 @@ -363,6 +429,13 @@ def __call__( batch_size = len(image) else: batch_size = image.shape[0] + if prompt is not None and isinstance(prompt, str): + prompt = batch_size * [prompt] + if isinstance(prompt_embeds_scale, float): + prompt_embeds_scale = batch_size * [prompt_embeds_scale] + if isinstance(pooled_prompt_embeds_scale, float): + pooled_prompt_embeds_scale = batch_size * [pooled_prompt_embeds_scale] + device = self._execution_device # 3. Prepare image embeddings @@ -378,24 +451,38 @@ def __call__( pooled_prompt_embeds, _, ) = self.encode_prompt( - prompt=[""] * batch_size, - prompt_2=None, - prompt_embeds=None, - pooled_prompt_embeds=None, + prompt=prompt, + prompt_2=prompt_2, + prompt_embeds=prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, device=device, num_images_per_prompt=1, max_sequence_length=512, lora_scale=None, ) else: + if prompt is not None: + logger.warning( + "prompt input is ignored when text encoders are not loaded to the pipeline. " + "Make sure to explicitly load the text encoders to enable prompt input. " + ) # max_sequence_length is 512, t5 encoder hidden size is 4096 prompt_embeds = torch.zeros((batch_size, 512, 4096), device=device, dtype=image_embeds.dtype) # pooled_prompt_embeds is 768, clip text encoder hidden size pooled_prompt_embeds = torch.zeros((batch_size, 768), device=device, dtype=image_embeds.dtype) - # Concatenate image and text embeddings + # scale & concatenate image and text embeddings prompt_embeds = torch.cat([prompt_embeds, image_embeds], dim=1) + prompt_embeds *= torch.tensor(prompt_embeds_scale, device=device, dtype=image_embeds.dtype)[:, None, None] + pooled_prompt_embeds *= torch.tensor(pooled_prompt_embeds_scale, device=device, dtype=image_embeds.dtype)[ + :, None + ] + + # weighted sum + prompt_embeds = torch.sum(prompt_embeds, dim=0, keepdim=True) + pooled_prompt_embeds = torch.sum(pooled_prompt_embeds, dim=0, keepdim=True) + # Offload all models self.maybe_free_model_hooks() From 73dac0c49e998f622c3315dce8b6a6e7a4107258 Mon Sep 17 00:00:00 2001 From: zhangp365 <144313702+zhangp365@users.noreply.github.com> Date: Thu, 5 Dec 2024 08:03:43 +0800 Subject: [PATCH 36/38] Fix a bug in the state dict judgment in ip_adapter.py. 
(#10095) * fix a judging state dict bug in ip_adapter.py * make --------- Co-authored-by: hlky --- src/diffusers/loaders/ip_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index c96cb21f78b3..ca460f948e6f 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -187,7 +187,7 @@ def load_ip_adapter( state_dict = pretrained_model_name_or_path_or_dict keys = list(state_dict.keys()) - if keys != ["image_proj", "ip_adapter"]: + if "image_proj" not in keys and "ip_adapter" not in keys: raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.") state_dicts.append(state_dict) From 96220390a2c060fd47b2c293eaf25c25e132636b Mon Sep 17 00:00:00 2001 From: linjiapro Date: Wed, 4 Dec 2024 16:20:05 -0800 Subject: [PATCH 37/38] Fix a bug for SD35 control net training and improve control net block index (#10065) * wip --------- Co-authored-by: YiYi Xu Co-authored-by: Sayak Paul --- .../models/controlnets/controlnet_sd3.py | 20 ++++++++++++------- .../models/transformers/transformer_sd3.py | 4 +--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/controlnets/controlnet_sd3.py b/src/diffusers/models/controlnets/controlnet_sd3.py index 4f3253d82f3d..9e361f2b16e5 100644 --- a/src/diffusers/models/controlnets/controlnet_sd3.py +++ b/src/diffusers/models/controlnets/controlnet_sd3.py @@ -393,13 +393,19 @@ def custom_forward(*inputs): return custom_forward ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} - encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - encoder_hidden_states, - temb, - **ckpt_kwargs, - ) + if self.context_embedder is not None: + encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + **ckpt_kwargs, + ) + else: + # SD3.5 8b controlnet use single transformer block, which does not use `encoder_hidden_states` + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), hidden_states, temb, **ckpt_kwargs + ) else: if self.context_embedder is not None: diff --git a/src/diffusers/models/transformers/transformer_sd3.py b/src/diffusers/models/transformers/transformer_sd3.py index a1ce9a2412c5..887e8afd2106 100644 --- a/src/diffusers/models/transformers/transformer_sd3.py +++ b/src/diffusers/models/transformers/transformer_sd3.py @@ -15,7 +15,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -424,8 +423,7 @@ def custom_forward(*inputs): # controlnet residual if block_controlnet_hidden_states is not None and block.context_pre_only is False: interval_control = len(self.transformer_blocks) / len(block_controlnet_hidden_states) - interval_control = int(np.ceil(interval_control)) - hidden_states = hidden_states + block_controlnet_hidden_states[index_block // interval_control] + hidden_states = hidden_states + block_controlnet_hidden_states[int(index_block / interval_control)] hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) From 243d9a49864ebb4562de6304a5fb9b9ebb496c6e Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 4 Dec 2024 14:22:36 -1000 Subject: [PATCH 38/38] pass attn mask arg for flux (#10122) 
---
 src/diffusers/models/attention_processor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 7351801368dd..13d910db6135 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -1908,7 +1908,9 @@ def __call__(
             query = apply_rotary_emb(query, image_rotary_emb)
             key = apply_rotary_emb(key, image_rotary_emb)
 
-        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
 
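The helper functions added in `tests/models/autoencoders/vae.py` earlier in this series return plain keyword dictionaries rather than model instances. A minimal sketch of how they are meant to be consumed, assuming the diffusers repository root is on the import path as it is when running the test suite:

```python
# Sketch: instantiating tiny autoencoders from the shared config helpers.
from diffusers import AsymmetricAutoencoderKL, AutoencoderKL, AutoencoderTiny

from tests.models.autoencoders.vae import (
    get_asym_autoencoder_kl_config,
    get_autoencoder_kl_config,
    get_autoencoder_tiny_config,
)

vae = AutoencoderKL(**get_autoencoder_kl_config())
asym_vae = AsymmetricAutoencoderKL(**get_asym_autoencoder_kl_config())
tiny_vae = AutoencoderTiny(**get_autoencoder_tiny_config())

# The configs are deliberately tiny (2-4 channels, 2 norm groups) so pipeline tests stay fast.
print(sum(p.numel() for p in vae.parameters()))
```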
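The `xm.mark_step()` call added to `pipeline_stable_audio.py` is a no-op unless torch_xla is installed; on XLA devices it flushes the lazy graph once per denoising step. A rough usage sketch under those assumptions; the checkpoint id, the TPU setup, and the soundfile post-processing are illustrative and not part of the patch:

```python
# Sketch: running StableAudioPipeline on an XLA (TPU) device; requires torch_xla and soundfile.
import soundfile as sf
import torch
import torch_xla.core.xla_model as xm

from diffusers import StableAudioPipeline

device = xm.xla_device()
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float32)
pipe = pipe.to(device)

# Each scheduler step now ends with xm.mark_step(), so the denoising loop executes step by step on the TPU.
audio = pipe(
    "gentle rain on a tin roof",
    negative_prompt="low quality",
    num_inference_steps=100,
    audio_end_in_s=10.0,
).audios[0]

sf.write("rain.wav", audio.T.float().cpu().numpy(), pipe.vae.sampling_rate)
```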
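The bitsandbytes change in `pipeline_utils.py` means `.to("cuda")` no longer raises for pipelines that contain bnb-quantized components, provided accelerate is newer than 1.1.0. The sketch below mirrors the new 4-bit test; the SD3 checkpoint id is an assumption, since the test pulls it from its base class:

```python
# Sketch: NF4-quantized SD3 transformer, then direct CUDA placement of the whole pipeline.
import torch
from diffusers import BitsAndBytesConfig, DiffusionPipeline, SD3Transformer2DModel

model_id = "stabilityai/stable-diffusion-3-medium-diffusers"  # assumed checkpoint

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
transformer_4bit = SD3Transformer2DModel.from_pretrained(
    model_id, subfolder="transformer", quantization_config=nf4_config, torch_dtype=torch.float16
)

# Requires accelerate > 1.1.0 (see the version check added in pipeline_utils.py).
pipe = DiffusionPipeline.from_pretrained(
    model_id, transformer=transformer_4bit, torch_dtype=torch.float16
).to("cuda")

image = pipe("a wooden table", max_sequence_length=77, num_inference_steps=28).images[0]
image.save("table.png")
```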
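Several FlowMatch pipelines above trade their unused `timesteps` argument for `sigmas`. The sketch below passes an explicit sigma schedule to `StableDiffusion3Pipeline`; the checkpoint id and the linear schedule, which matches the pipeline's default, are assumptions:

```python
# Sketch: supplying a custom sigma schedule to an SD3 pipeline after this change.
import numpy as np
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

num_steps = 28
# Same shape as the default schedule; adjust the spacing to trade quality for speed.
sigmas = np.linspace(1.0, 1 / num_steps, num_steps).tolist()

image = pipe(
    "a cat wearing a space suit",
    num_inference_steps=num_steps,
    sigmas=sigmas,  # forwarded to retrieve_timesteps(..., sigmas=sigmas)
).images[0]
image.save("cat.png")
```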
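The Flux Redux change accepts a prompt (only when text encoders are explicitly loaded into the prior pipeline), multiple reference images, and per-image embedding weights, and it sums the weighted embeddings into a single conditioning. A sketch of that flow; the checkpoint ids, image paths, and weights are illustrative assumptions:

```python
# Sketch: combining two reference images and text prompts with FluxPriorReduxPipeline.
import torch
from diffusers import FluxPipeline, FluxPriorReduxPipeline
from diffusers.utils import load_image

base = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")

# Reuse the base pipeline's text encoders so the new `prompt` input is honored.
prior = FluxPriorReduxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Redux-dev",
    text_encoder=base.text_encoder,
    tokenizer=base.tokenizer,
    text_encoder_2=base.text_encoder_2,
    tokenizer_2=base.tokenizer_2,
    torch_dtype=torch.bfloat16,
).to("cuda")

images = [load_image("style_reference.png"), load_image("subject_reference.png")]
prior_out = prior(
    images,
    prompt=["in the painterly style of the first image", "a portrait of the subject"],
    prompt_embeds_scale=[1.0, 0.8],        # per-image weights, summed into one embedding
    pooled_prompt_embeds_scale=[1.0, 0.8],
)

image = base(
    guidance_scale=2.5,
    num_inference_steps=50,
    **prior_out,  # prompt_embeds and pooled_prompt_embeds from the prior
).images[0]
image.save("redux.png")
```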