Skip to content

Commit

Permalink
Fix maisi diffusion net single gpu training issue (#1851)
Browse files Browse the repository at this point in the history
Fix maisi diffusion net single gpu training issue


### Checks
<!--- Put an `x` in all the boxes that apply, and remove the items that
do not apply -->
- [x] Avoid including large-size files in the PR.
- [x] Clean up long text outputs from code cells in the notebook.
- [x] For security purposes, please check the contents and remove any
sensitive info such as user names and private keys.
- [x] Ensure that (1) hyperlinks and markdown anchors work, (2) relative
paths are used for tutorial repo files, and (3) figures and graphs are
placed in the `./figure` folder.
- [x] Notebook runs automatically with `./runner.sh -t <path to .ipynb file>`

---------

Signed-off-by: YunLiu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
KumoLiu and pre-commit-ci[bot] authored Sep 30, 2024
1 parent e4cf547 commit 071eb1a
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 78 deletions.
40 changes: 23 additions & 17 deletions generation/maisi/maisi_diff_unet_training_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,8 @@
" model_config_filepath,\n",
" \"--model_def\",\n",
" model_def_filepath,\n",
" \"--num_gpus\",\n",
" str(num_gpus),\n",
"]\n",
"\n",
"run_torchrun(module, module_args, num_gpus=num_gpus)"
Expand Down Expand Up @@ -457,17 +459,17 @@
"INFO:training:[config] num_train_timesteps -> 1000.\n",
"INFO:training:num_files_train: 2\n",
"INFO:training:Training from scratch.\n",
"INFO:training:Scaling factor set to 0.89132159948349.\n",
"INFO:training:scale_factor -> 0.89132159948349.\n",
"INFO:training:Scaling factor set to 0.8903454542160034.\n",
"INFO:training:scale_factor -> 0.8903454542160034.\n",
"INFO:training:torch.set_float32_matmul_precision -> highest.\n",
"INFO:training:Epoch 1, lr 0.0001.\n",
"INFO:training:[2024-09-24 03:46:57] epoch 1, iter 1/2, loss: 0.7984, lr: 0.000100000000.\n",
"INFO:training:[2024-09-24 03:46:58] epoch 1, iter 2/2, loss: 0.7911, lr: 0.000056250000.\n",
"INFO:training:epoch 1 average loss: 0.7947.\n",
"INFO:training:[2024-09-30 06:30:33] epoch 1, iter 1/2, loss: 0.7974, lr: 0.000100000000.\n",
"INFO:training:[2024-09-30 06:30:33] epoch 1, iter 2/2, loss: 0.7939, lr: 0.000056250000.\n",
"INFO:training:epoch 1 average loss: 0.7957.\n",
"INFO:training:Epoch 2, lr 2.5e-05.\n",
"INFO:training:[2024-09-24 03:46:59] epoch 2, iter 1/2, loss: 0.7910, lr: 0.000025000000.\n",
"INFO:training:[2024-09-24 03:46:59] epoch 2, iter 2/2, loss: 0.7897, lr: 0.000006250000.\n",
"INFO:training:epoch 2 average loss: 0.7903.\n",
"INFO:training:[2024-09-30 06:30:35] epoch 2, iter 1/2, loss: 0.7902, lr: 0.000025000000.\n",
"INFO:training:[2024-09-30 06:30:35] epoch 2, iter 2/2, loss: 0.7889, lr: 0.000006250000.\n",
"INFO:training:epoch 2 average loss: 0.7895.\n",
"\n"
]
}
Expand All @@ -484,6 +486,8 @@
" model_config_filepath,\n",
" \"--model_def\",\n",
" model_def_filepath,\n",
" \"--num_gpus\",\n",
" str(num_gpus),\n",
"]\n",
"\n",
"run_torchrun(module, module_args, num_gpus=num_gpus)"
Expand Down Expand Up @@ -518,24 +522,24 @@
"output_type": "stream",
"text": [
"\n",
"INFO:inference:Using cuda:0 of 1 with random seed: 62801\n",
"INFO:inference:Using cuda:0 of 1 with random seed: 93612\n",
"INFO:inference:[config] ckpt_filepath -> ./temp_work_dir/./models/diff_unet_ckpt.pt.\n",
"INFO:inference:[config] random_seed -> 62801.\n",
"INFO:inference:[config] random_seed -> 93612.\n",
"INFO:inference:[config] output_prefix -> unet_3d.\n",
"INFO:inference:[config] output_size -> (256, 256, 128).\n",
"INFO:inference:[config] out_spacing -> (1.0, 1.0, 0.75).\n",
"INFO:root:`controllable_anatomy_size` is not provided.\n",
"INFO:inference:checkpoints ./temp_work_dir/./models/diff_unet_ckpt.pt loaded.\n",
"INFO:inference:scale_factor -> 0.89132159948349.\n",
"INFO:inference:scale_factor -> 0.8903454542160034.\n",
"INFO:inference:num_downsample_level -> 4, divisor -> 4.\n",
"INFO:inference:noise: cuda:0, torch.float32, <class 'torch.Tensor'>\n",
"\n",
" 0%| | 0/10 [00:00<?, ?it/s]\n",
" 10%|███████▍ | 1/10 [00:00<00:02, 3.62it/s]\n",
" 40%|█████████████████████████████▌ | 4/10 [00:00<00:00, 12.53it/s]\n",
" 80%|███████████████████████████████████████████████████████████▏ | 8/10 [00:00<00:00, 19.54it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 18.16it/s]\n",
"INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed62801_size256x256x128_spacing1.00x1.00x0.75_20240924034721.nii.gz.\n",
" 10%|███████▍ | 1/10 [00:00<00:02, 3.48it/s]\n",
" 40%|█████████████████████████████▌ | 4/10 [00:00<00:00, 12.23it/s]\n",
" 80%|███████████████████████████████████████████████████████████▏ | 8/10 [00:00<00:00, 19.26it/s]\n",
"100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 17.80it/s]\n",
"INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed93612_size256x256x128_spacing1.00x1.00x0.75_20240930063144_rank0.nii.gz.\n",
"\n"
]
}
Expand All @@ -552,6 +556,8 @@
" model_config_filepath,\n",
" \"--model_def\",\n",
" model_def_filepath,\n",
" \"--num_gpus\",\n",
" str(num_gpus),\n",
"]\n",
"\n",
"run_torchrun(module, module_args, num_gpus=num_gpus)\n",
Expand All @@ -562,7 +568,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
Expand Down
Loading

0 comments on commit 071eb1a

Please sign in to comment.