Fix maisi diffusion net single gpu training issue (#1851)

Fix maisi diffusion net single gpu training issue

### Checks

- [x] Avoid including large-size files in the PR.
- [x] Clean up long text outputs from code cells in the notebook.
- [x] For security purposes, please check the contents and remove any sensitive info such as user names and private key.
- [x] Ensure (1) hyperlinks and markdown anchors are working (2) use relative paths for tutorial repo files (3) put figure and graphs in the `./figure` folder
- [x] Notebook runs automatically `./runner.sh -t <path to .ipynb file>`

---------

Signed-off-by: YunLiu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Project-MONAI · Sep 30, 2024 · 071eb1a · 071eb1a
1 parent e4cf547
commit 071eb1a
Show file tree

Hide file tree

Showing 6 changed files with 168 additions and 78 deletions.
diff --git a/generation/maisi/maisi_diff_unet_training_tutorial.ipynb b/generation/maisi/maisi_diff_unet_training_tutorial.ipynb
@@ -336,6 +336,8 @@
     "    model_config_filepath,\n",
     "    \"--model_def\",\n",
     "    model_def_filepath,\n",
+    "    \"--num_gpus\",\n",
+    "    str(num_gpus),\n",
     "]\n",
     "\n",
     "run_torchrun(module, module_args, num_gpus=num_gpus)"
@@ -457,17 +459,17 @@
       "INFO:training:[config] num_train_timesteps -> 1000.\n",
       "INFO:training:num_files_train: 2\n",
       "INFO:training:Training from scratch.\n",
-      "INFO:training:Scaling factor set to 0.89132159948349.\n",
-      "INFO:training:scale_factor -> 0.89132159948349.\n",
+      "INFO:training:Scaling factor set to 0.8903454542160034.\n",
+      "INFO:training:scale_factor -> 0.8903454542160034.\n",
       "INFO:training:torch.set_float32_matmul_precision -> highest.\n",
       "INFO:training:Epoch 1, lr 0.0001.\n",
-      "INFO:training:[2024-09-24 03:46:57] epoch 1, iter 1/2, loss: 0.7984, lr: 0.000100000000.\n",
-      "INFO:training:[2024-09-24 03:46:58] epoch 1, iter 2/2, loss: 0.7911, lr: 0.000056250000.\n",
-      "INFO:training:epoch 1 average loss: 0.7947.\n",
+      "INFO:training:[2024-09-30 06:30:33] epoch 1, iter 1/2, loss: 0.7974, lr: 0.000100000000.\n",
+      "INFO:training:[2024-09-30 06:30:33] epoch 1, iter 2/2, loss: 0.7939, lr: 0.000056250000.\n",
+      "INFO:training:epoch 1 average loss: 0.7957.\n",
       "INFO:training:Epoch 2, lr 2.5e-05.\n",
-      "INFO:training:[2024-09-24 03:46:59] epoch 2, iter 1/2, loss: 0.7910, lr: 0.000025000000.\n",
-      "INFO:training:[2024-09-24 03:46:59] epoch 2, iter 2/2, loss: 0.7897, lr: 0.000006250000.\n",
-      "INFO:training:epoch 2 average loss: 0.7903.\n",
+      "INFO:training:[2024-09-30 06:30:35] epoch 2, iter 1/2, loss: 0.7902, lr: 0.000025000000.\n",
+      "INFO:training:[2024-09-30 06:30:35] epoch 2, iter 2/2, loss: 0.7889, lr: 0.000006250000.\n",
+      "INFO:training:epoch 2 average loss: 0.7895.\n",
       "\n"
      ]
     }
@@ -484,6 +486,8 @@
     "    model_config_filepath,\n",
     "    \"--model_def\",\n",
     "    model_def_filepath,\n",
+    "    \"--num_gpus\",\n",
+    "    str(num_gpus),\n",
     "]\n",
     "\n",
     "run_torchrun(module, module_args, num_gpus=num_gpus)"
@@ -518,24 +522,24 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "INFO:inference:Using cuda:0 of 1 with random seed: 62801\n",
+      "INFO:inference:Using cuda:0 of 1 with random seed: 93612\n",
       "INFO:inference:[config] ckpt_filepath -> ./temp_work_dir/./models/diff_unet_ckpt.pt.\n",
-      "INFO:inference:[config] random_seed -> 62801.\n",
+      "INFO:inference:[config] random_seed -> 93612.\n",
       "INFO:inference:[config] output_prefix -> unet_3d.\n",
       "INFO:inference:[config] output_size -> (256, 256, 128).\n",
       "INFO:inference:[config] out_spacing -> (1.0, 1.0, 0.75).\n",
       "INFO:root:`controllable_anatomy_size` is not provided.\n",
       "INFO:inference:checkpoints ./temp_work_dir/./models/diff_unet_ckpt.pt loaded.\n",
-      "INFO:inference:scale_factor -> 0.89132159948349.\n",
+      "INFO:inference:scale_factor -> 0.8903454542160034.\n",
       "INFO:inference:num_downsample_level -> 4, divisor -> 4.\n",
       "INFO:inference:noise: cuda:0, torch.float32, <class 'torch.Tensor'>\n",
       "\n",
       "  0%|                                                                                  | 0/10 [00:00<?, ?it/s]\n",
-      " 10%|███████▍                                                                  | 1/10 [00:00<00:02,  3.62it/s]\n",
-      " 40%|█████████████████████████████▌                                            | 4/10 [00:00<00:00, 12.53it/s]\n",
-      " 80%|███████████████████████████████████████████████████████████▏              | 8/10 [00:00<00:00, 19.54it/s]\n",
-      "100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 18.16it/s]\n",
-      "INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed62801_size256x256x128_spacing1.00x1.00x0.75_20240924034721.nii.gz.\n",
+      " 10%|███████▍                                                                  | 1/10 [00:00<00:02,  3.48it/s]\n",
+      " 40%|█████████████████████████████▌                                            | 4/10 [00:00<00:00, 12.23it/s]\n",
+      " 80%|███████████████████████████████████████████████████████████▏              | 8/10 [00:00<00:00, 19.26it/s]\n",
+      "100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 17.80it/s]\n",
+      "INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed93612_size256x256x128_spacing1.00x1.00x0.75_20240930063144_rank0.nii.gz.\n",
       "\n"
      ]
     }
@@ -552,6 +556,8 @@
     "    model_config_filepath,\n",
     "    \"--model_def\",\n",
     "    model_def_filepath,\n",
+    "    \"--num_gpus\",\n",
+    "    str(num_gpus),\n",
     "]\n",
     "\n",
     "run_torchrun(module, module_args, num_gpus=num_gpus)\n",
@@ -562,7 +568,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },