
Merge changes #115

Merged 45 commits on Oct 3, 2023

Changes from all commits (45 commits)
bdd2544
Tests compile fixes (#5148)
DN6 Sep 26, 2023
89d8f84
Timestep bias for fine-tuning SDXL (#5094)
bghira Sep 26, 2023
4a06c74
Min-SNR Gamma: follow-up fix for zero-terminal SNR models on v-predic…
bghira Sep 26, 2023
21e402f
fix-VaeImageProcessor-docstring (#5182)
ernestchu Sep 26, 2023
9946dcf
Test Fixes for CUDA Tests and Fast Tests (#5172)
DN6 Sep 26, 2023
fd1c54a
[docs] Improved text-to-image guide (#4938)
stevhliu Sep 26, 2023
d9e7857
timestep_spacing for FlaxDPMSolverMultistepScheduler (#5189)
pcuenca Sep 26, 2023
c82f7ba
[SDXL Flax] fix SDXL flax init (#5187)
patrickvonplaten Sep 26, 2023
16d56c4
F/flax split head dim (#5181)
entrpn Sep 26, 2023
ae2fc01
Wrap lines in docstring (#5190)
pcuenca Sep 26, 2023
ad06e51
[Docs] Improve xformers page (#5196)
patrickvonplaten Sep 27, 2023
940f941
Add `test_full_loop_with_noise` tests to all scheduler with `add_nosi…
yiyixuxu Sep 27, 2023
02247d9
PEFT Integration for Text Encoder to handle multiple alphas/ranks, di…
pacman100 Sep 27, 2023
ba59e92
Fix memory issues in tests (#5183)
DN6 Sep 27, 2023
cdcc01b
[Examples] add `compute_snr()` to training utils. (#5188)
sayakpaul Sep 27, 2023
a584d42
[LoRA, Xformers] Fix xformers lora (#5201)
patrickvonplaten Sep 27, 2023
cac7ada
[Flax SDXL] fix zero out sdxl (#5203)
patrickvonplaten Sep 27, 2023
693a0d0
Remove Offensive Language from Community Pipelines (#5206)
painebenjamin Sep 27, 2023
536c297
Trickle down `split_head_dim` (#5208)
pcuenca Sep 27, 2023
d840253
[`PEFT`] Fix typo for import (#5217)
younesbelkada Sep 28, 2023
1c4c4c4
Correct file name in t2i adapter training readme (#5207)
nbardy Sep 28, 2023
39baf0b
[`PEFT` / `LoRA` ] Kohya checkpoints support (#5202)
younesbelkada Sep 28, 2023
622f35b
fixed vae scaling (#5213)
asparius Sep 28, 2023
c78ee14
Move more slow tests to nightly (#5220)
DN6 Sep 28, 2023
1d3120f
[docs] Quicktour fixes (#5211)
stevhliu Sep 29, 2023
9c03a7d
Fix DDIMInverseScheduler (#5145)
richardSHkim Sep 29, 2023
78a7851
make style
patrickvonplaten Sep 29, 2023
9cfd4ef
Make `BaseOutput` dataclasses picklable (#5234)
cbensimon Sep 29, 2023
cc92332
[`PEFT` / `LoRA` ] Fix text encoder scaling (#5204)
younesbelkada Sep 29, 2023
84e5cc5
Fix doc KO unconditional_image_generation.md (#5236)
Sep 29, 2023
0c7cb9a
Flax: Ignore PyTorch, ONNX files when they coexist with Flax weights …
pcuenca Oct 2, 2023
907fd91
Fixed constants.py not using hugging face hub environment variable (#…
Zanz2 Oct 2, 2023
bbe8d3a
Compile test fixes (#5235)
DN6 Oct 2, 2023
4f74a5e
[PEFT warnings] Only sure deprecation warnings in the future (#5240)
patrickvonplaten Oct 2, 2023
2a62aad
Add docstrings in forward methods of adapter model (#5253)
Nandika-A Oct 2, 2023
db91e71
make style
patrickvonplaten Oct 2, 2023
cd1b8d7
[WIP] Refactor UniDiffuser Pipeline and Tests (#4948)
dg845 Oct 2, 2023
d56825e
fix: how print training resume logs. (#5117)
sayakpaul Oct 2, 2023
37a787a
Add docstring for the AutoencoderKL's decode (#5242)
freespirit Oct 2, 2023
7a4324c
Add a docstring for the AutoencoderKL's encode (#5239)
freespirit Oct 2, 2023
c8b0f0e
Update UniPC to support 1D diffusion. (#5199)
leng-yue Oct 2, 2023
bdd1611
[Schedulers] Fix callback steps (#5261)
patrickvonplaten Oct 2, 2023
2457599
make fix copies
patrickvonplaten Oct 2, 2023
dfcce3c
[Research folder] Add SDXL example (#5275)
patrickvonplaten Oct 3, 2023
7271f8b
Fix UniPC scheduler for 1D (#5276)
patrickvonplaten Oct 3, 2023
1 change: 1 addition & 0 deletions .github/workflows/build_docker_images.yml
@@ -26,6 +26,7 @@ jobs:
image-name:
- diffusers-pytorch-cpu
- diffusers-pytorch-cuda
+ - diffusers-pytorch-compile-cuda
- diffusers-flax-cpu
- diffusers-flax-tpu
- diffusers-onnxruntime-cpu
48 changes: 46 additions & 2 deletions .github/workflows/push_tests.yml
@@ -74,11 +74,11 @@ jobs:
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8

run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
- -s -v -k "not Flax and not Onnx" \
+ -s -v -k "not Flax and not Onnx and not compile" \
--make-reports=tests_${{ matrix.config.report }} \
tests/

@@ -113,6 +113,50 @@ jobs:
name: ${{ matrix.config.report }}_test_reports
path: reports

run_torch_compile_tests:
name: PyTorch Compile CUDA tests

runs-on: docker-gpu

container:
image: diffusers/diffusers-pytorch-compile-cuda
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2

- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Install dependencies
run: |
python -m pip install -e .[quality,test,training]

- name: Environment
run: |
python utils/print_env.py

- name: Run example tests on GPU
env:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/

- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt

- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: torch_compile_test_reports
path: reports

run_examples_tests:
name: Examples PyTorch CUDA tests on Ubuntu

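The new job reuses pytest's `-k` name filtering: tests whose ids contain "compile" run only in this job, while the main CUDA job now deselects them via `not compile`. A minimal sketch of a test this filter would pick up (the test body is illustrative, not taken from this PR):

```python
import pytest
import torch

# Selected by `-k "compile"`; deselected by `-k "... and not compile"`.
@pytest.mark.skipif(not hasattr(torch, "compile"), reason="requires torch >= 2.0")
def test_toy_model_compile():
    model = torch.nn.Linear(4, 4).eval()
    compiled = torch.compile(model)  # toy module stands in for a diffusers UNet
    with torch.no_grad():
        out = compiled(torch.randn(1, 4))
    assert out.shape == (1, 4)
```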
48 changes: 48 additions & 0 deletions docker/diffusers-pytorch-compile-cuda/Dockerfile
@@ -0,0 +1,48 @@
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
LABEL repository="diffusers"

ENV DEBIAN_FRONTEND=noninteractive

RUN apt update && \
apt install -y bash \
build-essential \
git \
git-lfs \
curl \
ca-certificates \
libsndfile1-dev \
libgl1 \
python3.9 \
python3.9-dev \
python3-pip \
python3.9-venv && \
rm -rf /var/lib/apt/lists

# make sure to use venv
RUN python3.9 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
RUN python3.9 -m pip install --no-cache-dir --upgrade pip && \
python3.9 -m pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
invisible_watermark && \
python3.9 -m pip install --no-cache-dir \
accelerate \
datasets \
hf-doc-builder \
huggingface-hub \
Jinja2 \
librosa \
numpy \
scipy \
tensorboard \
transformers \
omegaconf \
pytorch-lightning \
xformers

CMD ["/bin/bash"]
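A quick smoke test for the resulting image might look like the following (a hypothetical check, not part of this PR), confirming that the venv's torch build is new enough for `torch.compile` and can see the GPU:

```python
# Hypothetical sanity check for the diffusers-pytorch-compile-cuda image.
import torch

assert hasattr(torch, "compile"), "torch.compile requires torch >= 2.0"
print("torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```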
18 changes: 4 additions & 14 deletions docs/source/en/optimization/memory.md
@@ -321,21 +321,9 @@ with torch.inference_mode():

Recent work on optimizing bandwidth in the attention block has generated huge speed-ups and reductions in GPU memory usage. The most recent type of memory-efficient attention is [Flash Attention](https://arxiv.org/pdf/2205.14135.pdf) (you can check out the original code at [HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)).

- The table below details the speed-ups from a few different Nvidia GPUs when running inference on image sizes of 512x512 and a batch size of 1 (one prompt):
-
- | GPU | base attention (fp16) | memory-efficient attention (fp16) |
- |------------------|-----------------------|-----------------------------------|
- | NVIDIA Tesla T4 | 3.5it/s | 5.5it/s |
- | NVIDIA 3060 RTX | 4.6it/s | 7.8it/s |
- | NVIDIA A10G | 8.88it/s | 15.6it/s |
- | NVIDIA RTX A6000 | 11.7it/s | 21.09it/s |
- | NVIDIA TITAN RTX | 12.51it/s | 18.22it/s |
- | A100-SXM4-40GB | 18.6it/s | 29.it/s |
- | A100-SXM-80GB | 18.7it/s | 29.5it/s |
-
- <Tip warning={true}>
+ <Tip>

- If you have PyTorch 2.0 installed, you shouldn't use xFormers!
+ If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling `xformers`.

</Tip>

@@ -365,3 +353,5 @@ with torch.inference_mode():
# optional: You can disable it via
# pipe.disable_xformers_memory_efficient_attention()
```

The iteration speed when using `xformers` should match the iteration speed of Torch 2.0 as described [here](torch2.0).
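Putting this section's pieces together, enabling memory-efficient attention is a one-line call on the pipeline; a minimal end-to-end sketch (the checkpoint choice is illustrative):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

pipe.enable_xformers_memory_efficient_attention()
image = pipe("a photo of an astronaut riding a horse").images[0]

# optional: revert to the default attention implementation
pipe.disable_xformers_memory_efficient_attention()
```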
11 changes: 10 additions & 1 deletion docs/source/en/optimization/torch2.0.md
@@ -276,6 +276,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 22.24 | 23.23 | 43.76 | 49.25 |
| SD - controlnet | 15.02 | 15.82 | 32.13 | 36.08 |
| IF | 20.21 / <br>13.84 / <br>24.00 | 20.12 / <br>13.70 / <br>24.03 | ❌ | 97.34 / <br>27.23 / <br>111.66 |
| SDXL - txt2img | 8.64 | 9.9 | - | - |

### A100 (batch size: 4)

@@ -286,6 +287,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 11.67 | 13.31 | 14.88 | 17.48 |
| SD - controlnet | 8.28 | 9.38 | 10.51 | 12.41 |
| IF | 25.02 | 18.04 | ❌ | 48.47 |
| SDXL - txt2img | 2.44 | 2.74 | - | - |

### A100 (batch size: 16)

@@ -296,6 +298,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 3.04 | 3.66 | 3.9 | 4.76 |
| SD - controlnet | 2.15 | 2.58 | 2.74 | 3.35 |
| IF | 8.78 | 9.82 | ❌ | 16.77 |
| SDXL - txt2img | 0.64 | 0.72 | - | - |

### V100 (batch size: 1)

@@ -336,6 +339,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 6.91 | 6.7 | 7.01 | 7.37 |
| SD - controlnet | 4.89 | 4.86 | 5.35 | 5.48 |
| IF | 17.42 / <br>2.47 / <br>18.52 | 16.96 / <br>2.45 / <br>18.69 | ❌ | 24.63 / <br>2.47 / <br>23.39 |
| SDXL - txt2img | 1.15 | 1.16 | - | - |

### T4 (batch size: 4)

@@ -346,6 +350,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 1.81 | 1.82 | 2.09 | 2.09 |
| SD - controlnet | 1.34 | 1.27 | 1.47 | 1.46 |
| IF | 5.79 | 5.61 | ❌ | 7.39 |
| SDXL - txt2img | 0.288 | 0.289 | - | - |

### T4 (batch size: 16)

@@ -356,6 +361,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 2.30s | 2.26s | OOM after 2nd iteration | 1.95s |
| SD - controlnet | OOM after 2nd iteration | OOM after 2nd iteration | OOM after warmup | OOM after warmup |
| IF * | 1.44 | 1.44 | ❌ | 1.94 |
| SDXL - txt2img | OOM | OOM | - | - |

### RTX 3090 (batch size: 1)

@@ -396,6 +402,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 40.51 | 41.88 | 44.58 | 49.72 |
| SD - controlnet | 29.27 | 30.29 | 32.26 | 36.03 |
| IF | 69.71 / <br>18.78 / <br>85.49 | 69.13 / <br>18.80 / <br>85.56 | ❌ | 124.60 / <br>26.37 / <br>138.79 |
| SDXL - txt2img | 6.8 | 8.18 | - | - |

### RTX 4090 (batch size: 4)

@@ -406,6 +413,7 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 12.65 | 12.81 | 15.3 | 15.58 |
| SD - controlnet | 9.1 | 9.25 | 11.03 | 11.22 |
| IF | 31.88 | 31.14 | ❌ | 43.92 |
| SDXL - txt2img | 2.19 | 2.35 | - | - |

### RTX 4090 (batch size: 16)

@@ -416,10 +424,11 @@ In the following tables, we report our findings in terms of the *number of itera
| SD - inpaint | 3.17 | 3.2 | 3.85 | 3.85 |
| SD - controlnet | 2.23 | 2.3 | 2.7 | 2.75 |
| IF | 9.26 | 9.2 | ❌ | 13.31 |
| SDXL - txt2img | 0.52 | 0.53 | - | - |

## Notes

* Follow this [PR](https://github.com/huggingface/diffusers/pull/3313) for more details on the environment used for conducting the benchmarks.
* For the DeepFloyd IF pipelines with batch size > 1, we used a batch size > 1 only in the first (text-to-image) IF pipeline and NOT for upscaling; that is, the two upscaling pipelines received a batch size of 1.
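The SDXL rows added to the tables above follow the same measurement pattern as the other pipelines; a minimal sketch of a compiled SDXL txt2img run (prompt and step count are illustrative, and the compile settings mirror the pattern used elsewhere in this doc):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Requires torch >= 2.0; the first call pays the compilation cost,
# so time subsequent calls when measuring iterations per second.
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

image = pipe("an astronaut riding a horse", num_inference_steps=30).images[0]
```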

*Thanks to [Horace He](https://github.com/Chillee) from the PyTorch team for their support in improving our support of `torch.compile()` in Diffusers.*