From 29798efaae3b315f0140f48651aa20216865e719 Mon Sep 17 00:00:00 2001
From: vbuterin <v@buterin.com>
Date: Fri, 14 Jun 2024 10:11:10 +0200
Subject: [PATCH] Add an option to do the final step on a CPU

I needed to do this to be able to run the model on a 4070 (with 8 GB of RAM).
---
 stable_audio_tools/inference/generation.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/stable_audio_tools/inference/generation.py b/stable_audio_tools/inference/generation.py
index 843ab4b7..281c7bb2 100644
--- a/stable_audio_tools/inference/generation.py
+++ b/stable_audio_tools/inference/generation.py
@@ -104,6 +104,7 @@ def generate_diffusion_cond(
         init_noise_level: float = 1.0,
         mask_args: dict = None,
         return_latents = False,
+        cpu_final_step:bool = False,
         **sampler_kwargs
         ) -> torch.Tensor: 
     """
@@ -235,6 +236,8 @@ def generate_diffusion_cond(
 
     # v-diffusion: 
     #sampled = sample(model.model, noise, steps, 0, **conditioning_tensors, embedding_scale=cfg_scale)
+    if cpu_final_step:
+        model.to('cpu')
     del noise
     del conditioning_tensors
     del conditioning_inputs
@@ -244,6 +247,8 @@ def generate_diffusion_cond(
     if model.pretransform is not None and not return_latents:
         #cast sampled latents to pretransform dtype
         sampled = sampled.to(next(model.pretransform.parameters()).dtype)
+        if cpu_final_step:
+            sampled = sampled.to('cpu')
         sampled = model.pretransform.decode(sampled)
 
     # Return audio