Skip to content

Commit a7e0ed1

Browse files
committed
[DLMED] simplify to override
Signed-off-by: Nic Ma <[email protected]>
1 parent d80a4da commit a7e0ed1

File tree

3 files changed

+37
-3
lines changed

3 files changed

+37
-3
lines changed
Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
1+
{
2+
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
3+
"network": {
4+
"_target_": "torch.nn.parallel.DistributedDataParallel",
5+
"module": "$@network_def.to(@device)",
6+
"device_ids": ["@device"]
7+
},
8+
"train#sampler": {
9+
"_target_": "DistributedSampler",
10+
"dataset": "@train#dataset",
11+
"even_divisible": true,
12+
"shuffle": true
13+
},
14+
"train#dataloader#sampler": "@train#sampler",
15+
"train#dataloader#shuffle": false,
16+
"train#trainer#train_handlers": "$@train#handlers[: 1 if dist.get_rank() > 0 else None]",
17+
"validate#sampler": {
18+
"_target_": "DistributedSampler",
19+
"dataset": "@validate#dataset",
20+
"even_divisible": false,
21+
"shuffle": false
22+
},
23+
"validate#dataloader#sampler": "@validate#sampler",
24+
"validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
25+
"training": [
26+
"$import torch.distributed as dist",
27+
"$dist.init_process_group(backend='nccl')",
28+
"$torch.cuda.set_device(@device)",
29+
"$monai.utils.set_determinism(seed=123)",
30+
"$setattr(torch.backends.cudnn, 'benchmark', True)",
31+
"$@train#trainer.run()",
32+
"$dist.destroy_process_group()"
33+
]
34+
}

modules/bundles/spleen_segmentation/configs/train_multi_gpu.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -250,7 +250,7 @@
250250
"postprocessing": "@validate#postprocessing",
251251
"key_val_metric": "@validate#key_metric",
252252
"additional_metrics": "@validate#additional_metrics",
253-
"val_handlers": "$@validate#handlers if dist.get_rank() > 0 else None",
253+
"val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
254254
"amp": true
255255
}
256256
},

modules/bundles/spleen_segmentation/docs/README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,10 +29,10 @@ Execute training:
2929
python -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf
3030
```
3131

32-
Execute multi-GPU training:
32+
Override the train config to execute multi-GPU training:
3333

3434
```
35-
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train_multi_gpu.json --logging_file configs/logging.conf
35+
torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run training --meta_file configs/metadata.json --config_file configs/train.json --logging_file configs/logging.conf --args_file configs/multi_gpu_train.json
3636
```
3737

3838
Execute inference:

0 commit comments

Comments
 (0)