Skip to content

Commit 071eb1a

Browse files
Fix maisi diffusion net single gpu training issue (#1851)
Fix maisi diffusion net single gpu training issue

### Checks
<!--- Put an `x` in all the boxes that apply, and remove the not applicable items -->
- [x] Avoid including large-size files in the PR.
- [x] Clean up long text outputs from code cells in the notebook.
- [x] For security purposes, please check the contents and remove any sensitive info such as user names and private key.
- [x] Ensure (1) hyperlinks and markdown anchors are working (2) use relative paths for tutorial repo files (3) put figure and graphs in the `./figure` folder
- [x] Notebook runs automatically `./runner.sh -t <path to .ipynb file>`

---------

Signed-off-by: YunLiu <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent e4cf547 commit 071eb1a

File tree

6 files changed

+168
-78
lines changed

6 files changed

+168
-78
lines changed

generation/maisi/maisi_diff_unet_training_tutorial.ipynb

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,8 @@
336336
" model_config_filepath,\n",
337337
" \"--model_def\",\n",
338338
" model_def_filepath,\n",
339+
" \"--num_gpus\",\n",
340+
" str(num_gpus),\n",
339341
"]\n",
340342
"\n",
341343
"run_torchrun(module, module_args, num_gpus=num_gpus)"
@@ -457,17 +459,17 @@
457459
"INFO:training:[config] num_train_timesteps -> 1000.\n",
458460
"INFO:training:num_files_train: 2\n",
459461
"INFO:training:Training from scratch.\n",
460-
"INFO:training:Scaling factor set to 0.89132159948349.\n",
461-
"INFO:training:scale_factor -> 0.89132159948349.\n",
462+
"INFO:training:Scaling factor set to 0.8903454542160034.\n",
463+
"INFO:training:scale_factor -> 0.8903454542160034.\n",
462464
"INFO:training:torch.set_float32_matmul_precision -> highest.\n",
463465
"INFO:training:Epoch 1, lr 0.0001.\n",
464-
"INFO:training:[2024-09-24 03:46:57] epoch 1, iter 1/2, loss: 0.7984, lr: 0.000100000000.\n",
465-
"INFO:training:[2024-09-24 03:46:58] epoch 1, iter 2/2, loss: 0.7911, lr: 0.000056250000.\n",
466-
"INFO:training:epoch 1 average loss: 0.7947.\n",
466+
"INFO:training:[2024-09-30 06:30:33] epoch 1, iter 1/2, loss: 0.7974, lr: 0.000100000000.\n",
467+
"INFO:training:[2024-09-30 06:30:33] epoch 1, iter 2/2, loss: 0.7939, lr: 0.000056250000.\n",
468+
"INFO:training:epoch 1 average loss: 0.7957.\n",
467469
"INFO:training:Epoch 2, lr 2.5e-05.\n",
468-
"INFO:training:[2024-09-24 03:46:59] epoch 2, iter 1/2, loss: 0.7910, lr: 0.000025000000.\n",
469-
"INFO:training:[2024-09-24 03:46:59] epoch 2, iter 2/2, loss: 0.7897, lr: 0.000006250000.\n",
470-
"INFO:training:epoch 2 average loss: 0.7903.\n",
470+
"INFO:training:[2024-09-30 06:30:35] epoch 2, iter 1/2, loss: 0.7902, lr: 0.000025000000.\n",
471+
"INFO:training:[2024-09-30 06:30:35] epoch 2, iter 2/2, loss: 0.7889, lr: 0.000006250000.\n",
472+
"INFO:training:epoch 2 average loss: 0.7895.\n",
471473
"\n"
472474
]
473475
}
@@ -484,6 +486,8 @@
484486
" model_config_filepath,\n",
485487
" \"--model_def\",\n",
486488
" model_def_filepath,\n",
489+
" \"--num_gpus\",\n",
490+
" str(num_gpus),\n",
487491
"]\n",
488492
"\n",
489493
"run_torchrun(module, module_args, num_gpus=num_gpus)"
@@ -518,24 +522,24 @@
518522
"output_type": "stream",
519523
"text": [
520524
"\n",
521-
"INFO:inference:Using cuda:0 of 1 with random seed: 62801\n",
525+
"INFO:inference:Using cuda:0 of 1 with random seed: 93612\n",
522526
"INFO:inference:[config] ckpt_filepath -> ./temp_work_dir/./models/diff_unet_ckpt.pt.\n",
523-
"INFO:inference:[config] random_seed -> 62801.\n",
527+
"INFO:inference:[config] random_seed -> 93612.\n",
524528
"INFO:inference:[config] output_prefix -> unet_3d.\n",
525529
"INFO:inference:[config] output_size -> (256, 256, 128).\n",
526530
"INFO:inference:[config] out_spacing -> (1.0, 1.0, 0.75).\n",
527531
"INFO:root:`controllable_anatomy_size` is not provided.\n",
528532
"INFO:inference:checkpoints ./temp_work_dir/./models/diff_unet_ckpt.pt loaded.\n",
529-
"INFO:inference:scale_factor -> 0.89132159948349.\n",
533+
"INFO:inference:scale_factor -> 0.8903454542160034.\n",
530534
"INFO:inference:num_downsample_level -> 4, divisor -> 4.\n",
531535
"INFO:inference:noise: cuda:0, torch.float32, <class 'torch.Tensor'>\n",
532536
"\n",
533537
" 0%| | 0/10 [00:00<?, ?it/s]\n",
534-
" 10%|███████▍ | 1/10 [00:00<00:02, 3.62it/s]\n",
535-
" 40%|█████████████████████████████▌ | 4/10 [00:00<00:00, 12.53it/s]\n",
536-
" 80%|███████████████████████████████████████████████████████████▏ | 8/10 [00:00<00:00, 19.54it/s]\n",
537-
"100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 18.16it/s]\n",
538-
"INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed62801_size256x256x128_spacing1.00x1.00x0.75_20240924034721.nii.gz.\n",
538+
" 10%|███████▍ | 1/10 [00:00<00:02, 3.48it/s]\n",
539+
" 40%|█████████████████████████████▌ | 4/10 [00:00<00:00, 12.23it/s]\n",
540+
" 80%|███████████████████████████████████████████████████████████▏ | 8/10 [00:00<00:00, 19.26it/s]\n",
541+
"100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 17.80it/s]\n",
542+
"INFO:inference:Saved ./temp_work_dir/./predictions/unet_3d_seed93612_size256x256x128_spacing1.00x1.00x0.75_20240930063144_rank0.nii.gz.\n",
539543
"\n"
540544
]
541545
}
@@ -552,6 +556,8 @@
552556
" model_config_filepath,\n",
553557
" \"--model_def\",\n",
554558
" model_def_filepath,\n",
559+
" \"--num_gpus\",\n",
560+
" str(num_gpus),\n",
555561
"]\n",
556562
"\n",
557563
"run_torchrun(module, module_args, num_gpus=num_gpus)\n",
@@ -562,7 +568,7 @@
562568
],
563569
"metadata": {
564570
"kernelspec": {
565-
"display_name": "Python 3 (ipykernel)",
571+
"display_name": "Python 3",
566572
"language": "python",
567573
"name": "python3"
568574
},

0 commit comments

Comments
 (0)