diff --git a/generation/3d_ddpm/3d_ddpm_tutorial.ipynb b/generation/3d_ddpm/3d_ddpm_tutorial.ipynb new file mode 100644 index 000000000..c5067fc72 --- /dev/null +++ b/generation/3d_ddpm/3d_ddpm_tutorial.ipynb @@ -0,0 +1,795 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6286986e", + "metadata": {}, + "source": [ + "Copyright (c) MONAI Consortium \n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); \n", + "you may not use this file except in compliance with the License. \n", + "You may obtain a copy of the License at \n", + "    http://www.apache.org/licenses/LICENSE-2.0 \n", + "Unless required by applicable law or agreed to in writing, software \n", + "distributed under the License is distributed on an \"AS IS\" BASIS, \n", + "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. \n", + "See the License for the specific language governing permissions and \n", + "limitations under the License.\n", + "\n", + "# Denoising Diffusion Probabilistic Model on 3D data\n", + "\n", + "This tutorial illustrates how to use MONAI for training a denoising diffusion probabilistic model (DDPM)[1] to create synthetic 3D images.\n", + "\n", + "[1] - [Ho et al. 
\"Denoising Diffusion Probabilistic Models\"](https://arxiv.org/abs/2006.11239)\n", + "\n", + "\n", + "## Setup environment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f96b6f31", + "metadata": {}, + "outputs": [], + "source": [ + "!python -c \"import monai\" || pip install -q \"monai-weekly[nibabel, tqdm]\"\n", + "!python -c \"import matplotlib\" || pip install -q matplotlib\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "cbc01d24", + "metadata": {}, + "source": [ + "## Setup imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdea37d5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MONAI version: 1.4.0\n", + "Numpy version: 1.26.4\n", + "Pytorch version: 2.4.0+cu121\n", + "MONAI flags: HAS_EXT = False, USE_COMPILED = False, USE_META_DICT = False\n", + "MONAI rev id: 46a5272196a6c2590ca2589029eed8e4d56ff008\n", + "MONAI __file__: /home//miniconda3/envs/monai/lib/python3.9/site-packages/monai/__init__.py\n", + "\n", + "Optional dependencies:\n", + "Pytorch Ignite version: 0.4.11\n", + "ITK version: 5.3.0\n", + "Nibabel version: 5.2.1\n", + "scikit-image version: 0.22.0\n", + "scipy version: 1.13.0\n", + "Pillow version: 10.3.0\n", + "Tensorboard version: 2.16.2\n", + "gdown version: 5.2.0\n", + "TorchVision version: NOT INSTALLED or UNKNOWN VERSION.\n", + "tqdm version: 4.66.4\n", + "lmdb version: 1.4.1\n", + "psutil version: 5.9.0\n", + "pandas version: 2.2.2\n", + "einops version: 0.8.0\n", + "transformers version: 4.40.2\n", + "mlflow version: 2.12.2\n", + "pynrrd version: 1.0.0\n", + "clearml version: 1.16.0rc0\n", + "\n", + "For details about installing the optional dependencies, please visit:\n", + " https://docs.monai.io/en/latest/installation.html#installing-the-recommended-dependencies\n", + "\n" + ] + } + ], + "source": [ + "import os\n", + "import tempfile\n", + "import time\n", + "import shutil\n", + "\n", + "import 
matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from torch.amp import GradScaler, autocast\n", + "from tqdm import tqdm\n", + "\n", + "from monai.apps import DecathlonDataset\n", + "from monai.config import print_config\n", + "from monai.data import DataLoader\n", + "from monai.transforms import (\n", + " EnsureChannelFirstd,\n", + " CenterSpatialCropd,\n", + " Compose,\n", + " Lambdad,\n", + " LoadImaged,\n", + " Resized,\n", + " ScaleIntensityd,\n", + ")\n", + "from monai.utils import first, set_determinism\n", + "from monai.inferers import DiffusionInferer\n", + "from monai.networks.nets import DiffusionModelUNet\n", + "from monai.networks.schedulers import DDPMScheduler, DDIMScheduler\n", + "\n", + "print_config()" + ] + }, + { + "cell_type": "markdown", + "id": "50e37a43", + "metadata": {}, + "source": [ + "## Setup data directory\n", + "\n", + "You can specify a directory with the MONAI_DATA_DIRECTORY environment variable.\n", + "\n", + "This allows you to save results and reuse downloads.\n", + "\n", + "If not specified a temporary directory will be used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c38b4c33", + "metadata": {}, + "outputs": [], + "source": [ + "directory = os.environ.get(\"MONAI_DATA_DIRECTORY\")\n", + "root_dir = tempfile.mkdtemp() if directory is None else directory" + ] + }, + { + "cell_type": "markdown", + "id": "41af1391", + "metadata": {}, + "source": [ + "## Set deterministic training for reproducibility" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "515d8583", + "metadata": {}, + "outputs": [], + "source": [ + "set_determinism(42)" + ] + }, + { + "cell_type": "markdown", + "id": "29d8c601", + "metadata": {}, + "source": [ + "## Setup Decathlon Dataset and training and validation data loaders\n", + "\n", + "In this tutorial, we will use the 3D T1 weighted brain images from the [2016 and 2017 Brain Tumor Segmentation (BraTS) challenges](https://www.med.upenn.edu/sbia/brats2017/data.html). This dataset can be easily downloaded using the [DecathlonDataset](https://docs.monai.io/en/stable/apps.html#monai.apps.DecathlonDataset) from MONAI (`task=\"Task01_BrainTumour\"`). To load the training and validation images, we are using the `data_transform` transformations that are responsible for the following:\n", + "\n", + "1. `LoadImaged`: Loads the brain images from files.\n", + "2. `Lambdad`: Choose channel 1 of the image, which is the T1-weighted image.\n", + "3. `EnsureChannelFirstd`: Add the channel dimension of the input data.\n", + "4. `ScaleIntensityd`: Apply a min-max scaling in the intensity values of each image to be in the `[0, 1]` range.\n", + "5. `CenterSpatialCropd`: Crop the background of the images using a roi of size `[160, 200, 155]`.\n", + "6. `Resized`: Resize the images to a volume with size `[32, 40, 32]`.\n", + "\n", + "For the data loader, we are using mini-batches of 16 images, which consumes about 11GB of GPU memory during training. Please, reduce this value to run on smaller GPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f640d7ac", + "metadata": {}, + "outputs": [], + "source": [ + "data_transform = Compose(\n", + " [\n", + " LoadImaged(keys=[\"image\"]),\n", + " Lambdad(keys=\"image\", func=lambda x: x[:, :, :, 1]),\n", + " EnsureChannelFirstd(keys=[\"image\"], channel_dim=\"no_channel\"),\n", + " ScaleIntensityd(keys=[\"image\"]),\n", + " CenterSpatialCropd(keys=[\"image\"], roi_size=[160, 200, 155]),\n", + " Resized(keys=[\"image\"], spatial_size=(32, 40, 32)),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddd61e60", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading dataset: 100%|██████████| 388/388 [01:57<00:00, 3.29it/s]\n", + "Loading dataset: 100%|██████████| 96/96 [00:30<00:00, 3.13it/s]\n" + ] + } + ], + "source": [ + "batch_size = 16\n", + "num_workers = 8\n", + "\n", + "train_ds = DecathlonDataset(\n", + " root_dir=root_dir,\n", + " task=\"Task01_BrainTumour\",\n", + " transform=data_transform,\n", + " section=\"training\",\n", + " download=True,\n", + " num_workers=num_workers,\n", + ")\n", + "\n", + "train_loader = DataLoader(\n", + " train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, persistent_workers=True\n", + ")\n", + "\n", + "val_ds = DecathlonDataset(\n", + " root_dir=root_dir,\n", + " task=\"Task01_BrainTumour\",\n", + " transform=data_transform,\n", + " section=\"validation\",\n", + " download=True,\n", + " num_workers=num_workers,\n", + ")\n", + "\n", + "val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, persistent_workers=True)" + ] + }, + { + "cell_type": "markdown", + "id": "50efe5ef", + "metadata": {}, + "source": [ + "### Visualization of the training images" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bffb4abc", + "metadata": {}, + "outputs": [ + { + "data": { + 
"image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.subplots(1, 4, figsize=(10, 6))\n", + "for i in range(4):\n", + " plt.subplot(1, 4, i + 1)\n", + " plt.imshow(train_ds[i * 20][\"image\"][0, :, :, 15].detach().cpu(), vmin=0, vmax=1, cmap=\"gray\")\n", + " plt.axis(\"off\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "efc86997", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "img = first(item[\"image\"] for item in train_ds if \"400\" in item[\"image\"].meta[\"filename_or_obj\"])\n", + "imgn = img[0].detach().cpu().numpy()\n", + "f, (ax0, ax1, ax2, ax3) = plt.subplots(1, 4)\n", + "ax0.imshow(imgn[..., 15], vmin=0, vmax=1, cmap=\"gray\")\n", + "ax1.imshow(np.max(imgn, axis=0), vmin=0, vmax=1, cmap=\"gray\")\n", + "ax2.imshow(np.max(imgn, axis=1), vmin=0, vmax=1, cmap=\"gray\")\n", + "ax3.imshow(np.max(imgn, axis=2), vmin=0, vmax=1, cmap=\"gray\")" + ] + }, + { + "cell_type": "markdown", + "id": "d22296e5", + "metadata": {}, + "source": [ + "### Define network, scheduler, optimizer, and inferer\n", + "\n", + "We will use a DDPM in this example; for that, we need to define a `DiffusionModelUNet` network that will have as input the noisy images and the values for the timestep `t`, and it will predict the noise that is present in the image.\n", + "\n", + "In this example, we have a network with three levels (with 64, 64, and 128 channels, respectively). In every level, we will have two residual blocks, and only the last one will have an attention block with a single attention head (with 128 channels)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d499f7b1", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "device = torch.device(\"cuda\")\n", + "\n", + "model = DiffusionModelUNet(\n", + " spatial_dims=3,\n", + " in_channels=1,\n", + " out_channels=1,\n", + " channels=[64, 64, 128],\n", + " attention_levels=[False, False, True],\n", + " num_head_channels=[0, 0, 128],\n", + " num_res_blocks=2,\n", + ").to(device)" + ] + }, + { + "cell_type": "markdown", + "id": "47ad91ff", + "metadata": {}, + "source": [ + "Together with our U-net, we need to define the Noise Scheduler for the diffusion model. 
This scheduler is responsible for defining the amount of noise that should be added in each timestep `t` of the diffusion model's Markov chain. Besides that, it has the operations to perform the reverse process, which will remove the noise of the images (a.k.a. denoising process). In this case, we are using a `DDPMScheduler`. Here we are using 1000 timesteps and a `scaled_linear_beta` profile for the beta values (proposed in [Rombach et al. \"High-Resolution Image Synthesis with Latent Diffusion Models\"](https://arxiv.org/abs/2112.10752)). This profile had better results than the `linear` profile proposed in the original DDPM paper. In `beta_start` and `beta_end`, we define the limits for the beta values. These limits determine how strongly noise is added to the image at each timestep." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6c1de5ad", + "metadata": {}, + "outputs": [], + "source": [ + "num_train_timesteps = 1000\n", + "scheduler = DDPMScheduler(\n", + " num_train_timesteps=num_train_timesteps,\n", + " schedule=\"scaled_linear_beta\",\n", + " beta_start=0.0005,\n", + " beta_end=0.0195,\n", + " clip_sample=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "36d3e99a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'alpha cumprod')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(scheduler.alphas_cumprod.cpu(), color=(2 / 255, 163 / 255, 163 / 255), linewidth=2)\n", + "plt.xlabel(\"Timestep [t]\")\n", + "plt.ylabel(\"alpha cumprod\")" + ] + }, + { + "cell_type": "markdown", + "id": "9125f7c8", + "metadata": {}, + "source": [ + "Finally, we define the Inferer, which contains functions that will help during the training and sampling of the model, and the optimizer." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8685da6e", + "metadata": {}, + "outputs": [], + "source": [ + "inferer = DiffusionInferer(scheduler)\n", + "optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)" + ] + }, + { + "cell_type": "markdown", + "id": "9f371ad8", + "metadata": {}, + "source": [ + "## Model training\n", + "\n", + "In this part, we will train the diffusion model to predict the noise added to the images. For this, we are using an MSE loss between the prediction and the original noise. During the training, we are also sampling brain images to evaluate the evolution of the model. The training loop supports Automatic Mixed Precision (AMP) to save memory and speed up the training; it is controlled by the `use_amp` flag, which is set to `False` in the cell below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bd10b595", + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Epoch 00: 100%|██████████| 25/25 [00:35<00:00, 1.41s/it, loss=0.8318]\n", + "Epoch 01: 100%|██████████| 25/25 [00:35<00:00, 1.43s/it, loss=0.4999]\n", + "Epoch 02: 100%|██████████| 25/25 [00:37<00:00, 1.48s/it, loss=0.2719]\n", + "Epoch 03: 100%|██████████| 25/25 [00:36<00:00, 1.47s/it, loss=0.1401]\n", + "Epoch 04: 100%|██████████| 25/25 [00:36<00:00, 1.47s/it, loss=0.0709]\n", + "Epoch 05: 100%|██████████| 25/25 [00:37<00:00, 1.48s/it, loss=0.0394]\n", + "Epoch 06: 100%|██████████| 25/25 [00:36<00:00, 1.48s/it, loss=0.0207]\n", + "Epoch 07: 100%|██████████| 25/25 [00:37<00:00, 1.49s/it, loss=0.0145]\n", + "...\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKIAAACGCAYAAABez1E7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAK8UlEQVR4nO2dTWyNTRTHp74pqrQoKhoJQUJZEAmLsiCxkzQSVhasbHSBhT0LJBLsrIhIrCQSi640EhEb4rtolPgopfVR3x/v5n0n//Nv77zTq+09lf9vdSbn3vs889yTOec5c2am7Pfv37+DECVmRKlvQIgQZIjCCTJE4QIZonCBDFG4QIYoXCBDFC6QIQoXyBCFC0blfnDkyJGmPW7cuCh/+vQp+3s/f/6M8tixY42OJ3lGjBhRUPf169cojx8/3uhGjx6ddX1ul5WV9e7Av3z8+NG08Xf53vD6DPYphBA+f/5c8LPl5eWm/evXryhj/1mX6vOUKVOMrru727T5WSLYzy9fvhgd96vQ9QuhEVG4QIYoXFCWW/QwatSogu3JkycbHQ73379/txcE98cujN0GfvfHjx9G19PTU/D63759izKHDexCsPtVVVVGh26T+4/uBu8lhBBmzpxp2ujG3r59a3QVFRVRZnfLLhZht43X4OeKOg4F+NnhM+dQBZ/d+/fvjS4V1mDYUAiNiMIFMkThguy3Zn7zSQ3TEyZMiPK7d+8Kfm/ixIlG9+rVK9OeNGlSlNHdhhBCTU1NlPmNFt/8+Hv8Voju8MOHD0aH7phDDHSN/GzevHlT8F45NEC3PmbMGKPjsCL1po794N/B+2M3yWEF9pndP/6X3A/8LD/HHDQiChfIEIULZIjCBdkxIs+CcKoBwfQBv9ZjjMLZeUxlhGDjF07fYHzJOoyfWMf3jb+TSldwbIXXWLRokdE9ffrUtNvb26NcXV1tdPis+F5TM098PxhPp/rIz5x/B+NQju8x9kzNpOCsWy4aEYULZIjCBdkzKzwU4xDOaRieSEcwfcJZ/srKStPG9MWMGTOMDm979uzZRocuhL/H7g7bnAZCHYcNnZ2dUWZXzGkofHb4vRCsa2Q3yembadOmRZnTLphe4tmSrq6uKHP6KhX
WsItFV51TyNDXbxZCI6JwgQxRuECGKFyQnb5hP4+pDp5GwyoWjnuw+oRjS04tzJkzJ8pc0bJp06Yoc/qkvr4+yjzdxNNWGDNxbIvX7OjoMDrs48OHD42OpwMxncLTf/g82trajO7WrVsFf4djRIw9OdbFWC8V94Vg/1dMCYVgY03+Hexzqri2EBoRhQtkiMIFMkThguw8YqoCl8HYj3NhGFusX7/e6BobG0172bJlUV66dGn29UtNqkSrmOmvHFpaWqJ87tw5oztz5kyUuUKcc7AYJ3PuGPvB7wW506GF0IgoXCBDFC7Ids08NYZVIykdp0SampqivGPHDqOrq6vLuZVBg6etcPqLF0/h9CSnoV6+fGnanHoaas6ePRvlnTt3Gh2nt3B6lBezYZjFZsPpJESuWQwbZIjCBTJE4YLsGJEXe6cWTU+fPj3K27ZtM7ojR4705/7cwGkojIt52o5TXVyVXUqOHj1q2nv27DFtnLp8/vy50WEpHMeWaA+4ijOEdPz4HxoRhQtkiMIF2a6Zh9tUhUVDQ0OUT506lf09MfTs37/ftA8fPhxl3kMH/zt2t9ieOnWq0XHo0hcaEYULZIjCBTJE4YKiq29qa2ujzBXB58+fj/LatWv/5P7EINPa2mraW7ZsifL169eNDm2AVwpipTen+nilYF9oRBQukCEKF8gQhQuyY0TO/2EcsHr1aqNrbm4egFsTpQBLxrZu3Wp0OHXLKy6xzfEj73zRFxoRhQtkiMIFRe+hjUPxhg0bBu6ORElZsmRJlDFFF4KtuOHqK1wU9vr1635fVyOicIEMUbhAhihckB0j8tkduKoNF8KL4c2CBQuivHHjRqPD1E7qLBXcUDQXjYjCBTJE4YKi90fExdf92RfnbwGfx//1P7VQ3xu4KIxTNHjUHW8agCmbnGobRiOicIEMUbhAhihcUHTAghUWqeOw/hZ4P0Bs37171+g41YWr2jiexH3CvcEVNgjvvY0bUfGxyTn8/RYkhgUyROGComdWcC+YR48eGd26dev+8LZ6w8eloYsbrO2AEV5sjqROCg3Bbhc8f/58o8PtgHmfyVLDMyTz5s2LMrtt3BOymBSVRkThAhmicIEMUbgg25nzsV64sU4xUzr9pdSbN3HaBWPmFStWGB1vOIDpLU5tzJo1a6BuccCpqakxbay4Sf3n/D6Rg0ZE4QIZonCBDFG4IDtGTJ0z4r20abDhHGNlZWWyPVzgWBdjP44D8bO84jMHjYjCBTJE4YKifSqmJG7fvj0gNyN8wRVHL168iDIfj4bTk7w/Yg4aEYULZIjCBTJE4YLsGJH9PpYIXbp0aeDuSLghNVXHpW9YFljMdKxGROECGaJwQbZrxirjEOyCID4G4fLly1Fes2ZNkbcmSg1XYeMxeKkFUppZEcMWGaJwgQxRuKDoKT58Xefqm3379kX5+PHjRqe9FH3T0dER5fb2dqPDKT9cUB+CreDXKj4xbJEhChfIEIULsp05l/3gqjaeCmpra4vygQMHjG7v3r1RXr58ee7lhzWYV+PSKqxsrq6uHrJ7KsTFixejfOPGDaPDlXu8cSv2q5hNuTQiChfIEIULsl0zb4JUXl4e5c7OTqPD6b+Wlhajw9d+dtseXNNAgBsShWCfDx8LsXDhwoI63sxpMDh27JhpHzx4MMrPnj0zOnS5PI2HrrqYzaQ0IgoXyBCFC2SIwgXZJ9jj6eUh2D2UuVoX40eOLbGye/HixUZ36NAh066vr8+5tZKA54rwBk1cPoVnkvBp7kOxCROW8J04ccLoTp48adpdXV1R5o2WcENU3ISL4Q0HOGXVFxoRhQtkiMIF2a65oqLCtHG45WoLXDzDr/k49PP3Nm/ebNq7du2K8qpVq4wutaf1UMMntnO/cFaKQxXsx5/06erVq1E+ffq00V24cCHK3d3dBa8fQgg9PT1RZteM/WJ3iykbrMwKoXfo1hcaEYULZIjCBTJE4YLsGJGnbXDBPeswRsSNe0K
wqR1cFRZC75TAypUro9zY2Gh0TU1NUeYYZKCOZMMzUHi6C48uSx0HFoKNC+/du2d0eJQcyiGE0NzcbNpPnjyJ8pUrV4wOz7rB6ieG740rp1JTdRgz8p7q/AwK/WYhNCIKF8gQhQtkiMIF2TEiT2NhiRLnjTA3hXFWCDZG4RwWxySYt+JNoOrq6qK8e/duo2toaOjzev0Fq6f5vD/Mh2LcG0LvsrjW1tYo37x50+hw1dydO3eM7tq1awWvyflZfJac/8N4jv9Hzg/jM+d+IFVVVaaN04gco3M82RcaEYULZIjCBdmumV0juiOuLMbP8nQXuhAestnF4We5agVdJS/sqq2tjfL27duNjquI0DXxkV9z586N8uPHj40ON5568OCB0d2/f9+0U6kV7AenQDgthamWVLqEweM1UvsahmDdKv93OP3HeyBiiopDLK5G6guNiMIFMkThAhmicEHRZWAYS3C6AGMJ/nmcxuLUDrcxRcTpCowvOZWAMUkqlRGCjbs4fsT0BZdvpc4S4T6nSr3ws/jcQrDV7CHYcjOO9fD58NQpXpNjS/5fMWWV2gubj0dLoSk+MWyQIQoXZLtmnqFIVd2ia+SKEnQbXJnDGXmcBUhVgacy9+yaudoEXS5/Ft0vhwaYdmF3m9o7kNMl+PjZ/XMaJFVVhN9NpVZw84MQeqdWUpXWeK9coY3Pimehcty4RkThAhmicIEMUbggO0YUYjDRiChcIEMULpAhChfIEIULZIjCBTJE4QIZonCBDFG4QIYoXPAP9l6XqmB6Ay4AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train completed, total time: 1676.1360158920288.\n" + ] + } + ], + "source": [ + "max_epochs = 40\n", + "val_interval = max(1, max_epochs // 5)\n", + "use_amp = False\n", + "epoch_loss_list = []\n", + "val_epoch_loss_list = []\n", + "\n", + "# The gradient scaler follows the same `use_amp` switch as the autocast contexts below\n", + "scaler = GradScaler(device=\"cuda\", enabled=use_amp)\n", + "total_start = time.time()\n", + "for epoch in range(max_epochs):\n", + " model.train()\n", + " epoch_loss = 0\n", + " progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), ncols=70)\n", + " progress_bar.set_description(f\"Epoch {epoch:02}\")\n", + " for step, batch in progress_bar:\n", + " images = batch[\"image\"].to(device)\n", + " optimizer.zero_grad(set_to_none=True)\n", + "\n", + " with autocast(device_type=\"cuda\", enabled=use_amp):\n", + " # Generate random noise (created directly on the device of `images`)\n", + " noise = torch.randn_like(images)\n", + "\n", + " # Create timesteps\n", + " timesteps = torch.randint(0, num_train_timesteps, (images.shape[0],), device=images.device).long()\n", + "\n", + " # Get model prediction\n", + " noise_pred = inferer(inputs=images, diffusion_model=model, noise=noise, timesteps=timesteps)\n", + "\n", + " loss = F.mse_loss(noise_pred.float(), noise.float())\n", + "\n", + " if use_amp:\n", + " scaler.scale(loss).backward()\n", + " scaler.step(optimizer)\n", + " scaler.update()\n", + " else:\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " epoch_loss += loss.item()\n", + "\n", + " progress_bar.set_postfix({\"loss\": f\"{epoch_loss / (step + 1):.4f}\"})\n", + " epoch_loss_list.append(epoch_loss / (step + 1))\n", + "\n", + " # Every `val_interval` epochs: save a checkpoint, compute the validation loss and draw one sample\n", + " if (epoch + 1) % val_interval == 0:\n", + " model.eval()\n", + "\n", + " torch.save(model.state_dict(), f\"model_{epoch:04}.pth\")\n", + "\n", + " val_epoch_loss = 0\n", + " for step, batch in enumerate(val_loader):\n", + " images = batch[\"image\"].to(device)\n",
+ " noise = torch.randn_like(images)\n", + " with torch.no_grad(), autocast(device_type=\"cuda\", enabled=use_amp):\n", + " timesteps = torch.randint(0, num_train_timesteps, (images.shape[0],), device=images.device).long()\n", + "\n", + " # Get model prediction\n", + " noise_pred = inferer(inputs=images, diffusion_model=model, noise=noise, timesteps=timesteps)\n", + " val_loss = F.mse_loss(noise_pred.float(), noise.float())\n", + "\n", + " val_epoch_loss += val_loss.item()\n", + " progress_bar.set_postfix({\"val_loss\": val_epoch_loss / (step + 1)})\n", + " val_epoch_loss_list.append(val_epoch_loss / (step + 1))\n", + "\n", + " # Sampling image during training\n", + " image = torch.randn((1, 1, 32, 40, 32))\n", + " image = image.to(device)\n", + " # Sample with the full training schedule; reusing the variable keeps it in sync with the scheduler\n", + " scheduler.set_timesteps(num_inference_steps=num_train_timesteps)\n", + " with autocast(device_type=\"cuda\", enabled=use_amp):\n", + " image = inferer.sample(input_noise=image, diffusion_model=model, scheduler=scheduler)\n", + "\n", + " plt.figure(figsize=(2, 2))\n", + " plt.imshow(image[0, 0, :, :, 15].cpu(), vmin=0, vmax=1, cmap=\"gray\")\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "\n", + "total_time = time.time() - total_start\n", + "print(f\"train completed, total time: {total_time}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e263b67", + "metadata": {}, + "source": [ + "### Learning curves" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c7520419", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.style.use(\"seaborn-v0_8\")\n", + "plt.title(\"Learning Curves\", fontsize=20)\n", + "plt.plot(np.linspace(1, max_epochs, max_epochs), epoch_loss_list, color=\"C0\", linewidth=2.0, label=\"Train\")\n", + "plt.plot(\n", + " np.linspace(val_interval, max_epochs, int(max_epochs / val_interval)),\n", + " val_epoch_loss_list,\n", + " color=\"C1\",\n", + " linewidth=2.0,\n", + " label=\"Validation\",\n", + ")\n", + "plt.yticks(fontsize=12)\n", + "plt.xticks(fontsize=12)\n", + "plt.xlabel(\"Epochs\", fontsize=16)\n", + "plt.ylabel(\"Loss\", fontsize=16)\n", + "plt.legend(prop={\"size\": 14})\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "38724c9b", + "metadata": {}, + "source": [ + "## Sampling Brain Image\n", + "\n", + "In order to sample the brain images, we need to pass the model an image containing just noise and use it to remove the noise of the image iteratively. For that, we will use the `.sample()` function of the `inferer`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "092eb6a0", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1000 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.style.use(\"default\")\n", + "plotting_image_0 = np.concatenate([image[0, 0, :, :, 15].cpu(), np.flipud(image[0, 0, :, 20, :].cpu().T)], axis=1)\n", + "plotting_image_1 = np.concatenate([np.flipud(image[0, 0, 15, :, :].cpu().T), np.zeros((32, 32))], axis=1)\n", + "plt.imshow(np.concatenate([plotting_image_0, plotting_image_1], axis=0), vmin=0, vmax=1, cmap=\"gray\")\n", + "plt.tight_layout()\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f0acc27a", + "metadata": {}, + "source": [ + "### Sampling with Denoising Diffusion Implicit Model Scheduler\n", + "\n", + "Recent papers have proposed different ways to improve the sampling speed by using fewer steps in the denoising process. In this example, we are using a `DDIMScheduler` (from [Song et al. \"Denoising Diffusion Implicit Models\"](https://arxiv.org/abs/2010.02502)) to reduce the original number of steps from 1000 to 250." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e3e43b95", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250/250 [00:07<00:00, 32.79it/s]\n" + ] + } + ], + "source": [ + "scheduler_ddim = DDIMScheduler(\n", + " num_train_timesteps=num_train_timesteps,\n", + " schedule=\"scaled_linear_beta\",\n", + " beta_start=0.0005,\n", + " beta_end=0.0195,\n", + " clip_sample=False,\n", + ")\n", + "\n", + "scheduler_ddim.set_timesteps(num_inference_steps=250)\n", + "\n", + "model.eval()\n", + "noise = torch.randn((1, 1, 32, 40, 32))\n", + "noise = noise.to(device)\n", + "\n", + "image = inferer.sample(input_noise=noise, diffusion_model=model, scheduler=scheduler_ddim)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "89f93ab8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.style.use(\"default\")\n", + "plotting_image_0 = np.concatenate([image[0, 0, :, :, 15].cpu(), np.flipud(image[0, 0, :, 20, :].cpu().T)], axis=1)\n", + "plotting_image_1 = np.concatenate([np.flipud(image[0, 0, 15, :, :].cpu().T), np.zeros((32, 32))], axis=1)\n", + "plt.imshow(np.concatenate([plotting_image_0, plotting_image_1], axis=0), vmin=0, vmax=1, cmap=\"gray\")\n", + "plt.tight_layout()\n", + "plt.axis(\"off\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b39a47bb", + "metadata": {}, + "source": [ + "### Cleanup data directory\n", + "\n", + "Remove directory if a temporary was used." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "957a45ff", + "metadata": {}, + "outputs": [], + "source": [ + "if directory is None:\n", + " shutil.rmtree(root_dir)" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "py:percent,ipynb" + }, + "kernelspec": { + "display_name": "monai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/generation/README.md b/generation/README.md index 351416fd1..f2932f6b8 100644 --- a/generation/README.md +++ b/generation/README.md @@ -49,6 +49,9 @@ Examples show how to train Vector Quantized Variation Autoencoder on [2D](./2d_v ## [Training a 2D Denoising Diffusion Probabilistic Model](./2d_ddpm/2d_ddpm_tutorial.ipynb): Example shows how to easily train a DDPM on medical data (MedNIST). +## [Training a 3D Denoising Diffusion Probabilistic Model](./3d_ddpm/3d_ddpm_tutorial.ipynb): +Example shows how to easily train a DDPM on medical data (Decathlon Task 01). 
+ ## [Comparing different noise schedulers](./2d_ddpm/2d_ddpm_compare_schedulers.ipynb): Example compares the performance of different noise schedulers. This shows how to sample a diffusion model using the DDPM, DDIM, and PNDM schedulers and how different numbers of timesteps affect the quality of the samples.