Skip to content

Commit b8cb649

Browse files
authored
Merge branch 'master' into fix-typo-fast-training
2 parents c6c7e7c + ff586e0 commit b8cb649

34 files changed

+2001
-67
lines changed

.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,13 @@ tests/testing_data/*Hippocampus*
138138
# Ignore torch saves
139139
*/torch/runs
140140
logs
141+
*/runs
142+
lightning_logs
143+
144+
# ignore automatically created files
145+
*.ts
146+
nohup.out
147+
deepgrow/ignite/_image.nii.gz
148+
*.zip
149+
deployment/bentoml/mednist_classifier_bentoml.py
150+
deployment/ray/mednist_classifier_start.py

3d_classification/densenet_training_array.ipynb

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -131,40 +131,17 @@
131131
}
132132
],
133133
"source": [
134-
"# set this in your environment or previous cell to wherever IXI is downloaded and extracted\n",
134+
"# Set data directory\n",
135135
"directory = os.environ.get(\"MONAI_DATA_DIRECTORY\")\n",
136-
"\n",
137-
"if directory is None:\n",
138-
" resource = \"http://biomedic.doc.ic.ac.uk/brain-development/downloads/IXI/IXI-T1.tar\"\n",
139-
" md5 = \"34901a0593b41dd19c1a1f746eac2d58\"\n",
140-
"\n",
141-
" root_dir = tempfile.mkdtemp()\n",
142-
"\n",
143-
" dataset_dir = os.path.join(root_dir, \"ixi\")\n",
144-
" tarfile_name = f\"{dataset_dir}.tar\"\n",
145-
"\n",
146-
" download_and_extract(resource, tarfile_name, dataset_dir, md5)\n",
147-
"else:\n",
148-
" root_dir = directory\n",
149-
"\n",
136+
"root_dir = tempfile.mkdtemp() if directory is None else directory\n",
150137
"print(root_dir)"
151138
]
152139
},
153140
{
154141
"cell_type": "code",
155142
"execution_count": 4,
156143
"metadata": {},
157-
"outputs": [
158-
{
159-
"name": "stdout",
160-
"output_type": "stream",
161-
"text": [
162-
"<class 'torch.Tensor'> torch.Size([3, 1, 96, 96, 96]) tensor([[1., 0.],\n",
163-
" [1., 0.],\n",
164-
" [1., 0.]]) torch.Size([3, 2])\n"
165-
]
166-
}
167-
],
144+
"outputs": [],
168145
"source": [
169146
"# IXI dataset as a demo, downloadable from https://brain-development.org/ixi-dataset/\n",
170147
"images = [\n",
@@ -195,8 +172,58 @@
195172
"\n",
196173
"# Represent labels in one-hot format for binary classifier training,\n",
197174
"# BCEWithLogitsLoss requires target to have same shape as input\n",
198-
"labels = torch.nn.functional.one_hot(torch.as_tensor(labels)).float()\n",
175+
"labels = torch.nn.functional.one_hot(torch.as_tensor(labels)).float()"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": 5,
181+
"metadata": {},
182+
"outputs": [
183+
{
184+
"name": "stderr",
185+
"output_type": "stream",
186+
"text": [
187+
"ixi.tar: 100%|██████████| 4.51G/4.51G [08:19<00:00, 9.70MB/s] \n"
188+
]
189+
},
190+
{
191+
"name": "stdout",
192+
"output_type": "stream",
193+
"text": [
194+
"2022-05-04 12:23:06,530 - INFO - Downloaded: /mnt/data/rbrown/Documents/Data/MONAI/ixi.tar\n",
195+
"2022-05-04 12:23:13,734 - INFO - Verified 'ixi.tar', md5: 34901a0593b41dd19c1a1f746eac2d58.\n",
196+
"2022-05-04 12:23:13,735 - INFO - Writing into directory: /mnt/data/rbrown/Documents/Data/MONAI/ixi.\n"
197+
]
198+
}
199+
],
200+
"source": [
201+
"if not os.path.isfile(images[0]):\n",
202+
" resource = \"http://biomedic.doc.ic.ac.uk/brain-development/downloads/IXI/IXI-T1.tar\"\n",
203+
" md5 = \"34901a0593b41dd19c1a1f746eac2d58\"\n",
199204
"\n",
205+
" dataset_dir = os.path.join(root_dir, \"ixi\")\n",
206+
" tarfile_name = f\"{dataset_dir}.tar\"\n",
207+
"\n",
208+
" download_and_extract(resource, tarfile_name, dataset_dir, md5)"
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": 6,
214+
"metadata": {},
215+
"outputs": [
216+
{
217+
"name": "stdout",
218+
"output_type": "stream",
219+
"text": [
220+
"<class 'torch.Tensor'> torch.Size([3, 1, 96, 96, 96]) tensor([[1., 0.],\n",
221+
" [1., 0.],\n",
222+
" [1., 0.]]) torch.Size([3, 2])\n"
223+
]
224+
}
225+
],
226+
"source": [
200227
"# Define transforms\n",
201228
"train_transforms = Compose([ScaleIntensity(), AddChannel(), Resize((96, 96, 96)), RandRotate90(), EnsureType()])\n",
202229
"\n",
@@ -220,7 +247,7 @@
220247
},
221248
{
222249
"cell_type": "code",
223-
"execution_count": 5,
250+
"execution_count": 7,
224251
"metadata": {},
225252
"outputs": [
226253
{
@@ -367,7 +394,7 @@
367394
},
368395
{
369396
"cell_type": "code",
370-
"execution_count": 6,
397+
"execution_count": 8,
371398
"metadata": {},
372399
"outputs": [],
373400
"source": [
@@ -395,7 +422,7 @@
395422
},
396423
{
397424
"cell_type": "code",
398-
"execution_count": 7,
425+
"execution_count": 9,
399426
"metadata": {},
400427
"outputs": [],
401428
"source": [
@@ -405,7 +432,7 @@
405432
},
406433
{
407434
"cell_type": "code",
408-
"execution_count": 8,
435+
"execution_count": 10,
409436
"metadata": {
410437
"scrolled": true,
411438
"tags": []
@@ -462,7 +489,7 @@
462489
},
463490
{
464491
"cell_type": "code",
465-
"execution_count": 9,
492+
"execution_count": 11,
466493
"metadata": {},
467494
"outputs": [],
468495
"source": [
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Ignore the following files/folders during docker build
2+
3+
__pycache__/
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# IDE
2+
.idea/
3+
4+
# artifacts
5+
poc/
6+
*.pyc
7+
result_*
8+
*.pth
9+
logs
10+
11+
# example data
12+
*preprocessed*
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# use python base image
2+
FROM python:3.8.10
3+
# Use the key=value form; the space-separated `ENV key value` syntax is legacy
# and is flagged by Docker build checks.
ENV DEBIAN_FRONTEND=noninteractive
4+
5+
# specify the server FQDN as commandline argument
6+
ARG server_fqdn
7+
# Log the FQDN passed via --build-arg so it is visible in the build output.
RUN echo "Setting up FL workspace with FQDN: ${server_fqdn}"
8+
9+
# add your code to container
10+
COPY code /code
11+
12+
# add code to path
13+
ENV PYTHONPATH=${PYTHONPATH}:"/code"
14+
15+
# install dependencies
16+
# RUN python -m pip install --upgrade pip
17+
RUN pip3 install tensorboard sklearn torchvision
18+
RUN pip3 install monai==0.8.1
19+
RUN pip3 install nvflare==2.0.16
20+
21+
# mount nvflare from source
22+
#RUN pip install tenseal
23+
#WORKDIR /code
24+
#RUN git clone https://github.com/NVIDIA/NVFlare.git
25+
#ENV PYTHONPATH=${PYTHONPATH}:"/code/NVFlare"
26+
27+
# download pretrained weights
28+
ENV TORCH_HOME=/opt/torch
29+
RUN python3 /code/pt/utils/download_model.py --model_url=https://download.pytorch.org/models/resnet18-f37072fd.pth
30+
31+
# prepare FL workspace
32+
WORKDIR /code
33+
RUN sed -i "s|{SERVER_FQDN}|${server_fqdn}|g" fl_project.yml
34+
RUN python3 -m nvflare.lighter.provision -p fl_project.yml
35+
RUN cp -r workspace/fl_project/prod_00 fl_workspace
36+
RUN mv fl_workspace/${server_fqdn} fl_workspace/server
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
## MammoFL_MICCAI2022
2+
3+
Reference implementation for
4+
[ACR-NVIDIA-NCI Breast Density FL challenge](http://BreastDensityFL.acr.org).
5+
6+
Held in conjunction with [MICCAI 2022](https://conferences.miccai.org/2022/en/).
7+
8+
9+
------------------------------------------------
10+
## 1. Run Training using [NVFlare](https://github.com/NVIDIA/NVFlare) reference implementation
11+
12+
We provide a minimal example of how to implement Federated Averaging using [NVFlare 2.0](https://github.com/NVIDIA/NVFlare) and [MONAI](https://monai.io/) to train
13+
a breast density prediction model with ResNet18.
14+
15+
### 1.1 Download example data
16+
Follow the steps described in [./data/README.md](./data/README.md) to download an example breast density mammography dataset.
17+
Note, the data used in the actual challenge will be different. We do however follow the same preprocessing steps and
18+
use the same four BI-RADS breast density classes for prediction. See [./code/pt/utils/preprocess_dicomdir.py](./code/pt/utils/preprocess_dicomdir.py) for details.
19+
20+
We provide a set of random data splits. Please download them using
21+
```
22+
python3 ./code/pt/utils/download_datalists_and_predictions.py
23+
```
24+
After download, they will be available as `./data/dataset_blinded_site-*.json` which follows the same format as what
25+
will be used in the challenge.
26+
Please do not modify the data list filenames in the configs as they will be the same during the challenge.
27+
28+
Note, the location of the dataset and data lists will be given by the system.
29+
Do not change the locations given in [config_fed_client.json](./code/configs/mammo_fedavg/config/config_fed_client.json):
30+
```
31+
"DATASET_ROOT": "/data/preprocessed",
32+
"DATALIST_PREFIX": "/data/dataset_blinded_",
33+
```
34+
35+
### 1.2 Build container
36+
The argument specifies the FQDN (Fully Qualified Domain Name) of the FL server. Use `localhost` when simulating FL on your machine.
37+
```
38+
./build_docker.sh localhost
39+
```
40+
Note, all code and pretrained models need to be included in the docker image.
41+
The virtual machines running the containers will not have public internet access during training.
42+
For an example, please see the `download_model.py` used to download ImageNet pretrained weights in this example.
43+
44+
The Dockerfile will be submitted using the [MedICI platform](https://www.medici-challenges.org).
45+
For detailed instructions, see the [challenge website](http://BreastDensityFL.acr.org).
46+
47+
### 1.3 Run server and clients containers, and start training
48+
Run all commands at once using the script below. Note that this will also create separate logs under `./logs`.
49+
```
50+
./run_all_fl.sh
51+
```
52+
Note, the GPU index to use for each client is specified inside `run_all_fl.sh`.
53+
See the individual `run_docker_site-*.sh` commands described below.
54+
Note, the server script will automatically kill all running containers used in this example
55+
and final results will be placed under `./result_server`.
56+
57+
(Optional) Run each command in a separate terminal to get site-specific printouts in separate windows.
58+
59+
The argument for each shell script specifies the GPU index to be used.
60+
```
61+
./run_docker_server.sh
62+
./run_docker_site-1.sh 0
63+
./run_docker_site-2.sh 1
64+
./run_docker_site-3.sh 0
65+
```
66+
67+
### 1.4 (Optional) Visualize training using TensorBoard
68+
After training has completed, the training curves can be visualized using
69+
```
70+
tensorboard --logdir=./result_server
71+
```
72+
A visualization of the global accuracy and [Kappa](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html) validation scores for each site with the provided example data is shown below.
73+
The current setup runs on a machine with two NVIDIA GPUs with 12GB memory each.
74+
The runtime for this experiment is about 45 minutes.
75+
You can adjust the argument to the `run_docker_site-*.sh` scripts to specify different
76+
GPU indices if needed in your environment.
77+
78+
![](./figs/example_data_val_global_acc_kappa.png)
79+
80+
### 1.5 (Optional) Kill all containers
81+
If you didn't use `run_all_fl.sh`, all containers can be killed by running
82+
```
83+
docker kill server site-1 site-2 site-3
84+
```
85+
86+
87+
------------------------------------------------
88+
## 2. Modify the FL algorithm
89+
90+
You can modify and extend the provided example code under [./code/pt](./code/pt).
91+
92+
You could use other components available at [NVFlare](https://github.com/NVIDIA/NVFlare)
93+
or enhance the training pipeline using your custom code or features of other libraries.
94+
95+
See the [NVFlare examples](https://github.com/NVIDIA/NVFlare/tree/main/examples) for features that could be utilized in this challenge.
96+
97+
### 2.1 Debugging the learning algorithm
98+
99+
The example NVFlare `Learner` class is implemented at [./code/pt/learners/mammo_learner.py](./code/pt/learners/mammo_learner.py).
100+
You can debug the file using the `MockClientEngine` as shown in the script by running
101+
```
102+
python3 code/pt/learners/mammo_learner.py
103+
```
104+
Furthermore, you can test it inside the container, by first running
105+
```
106+
./run_docker_debug.sh
107+
```
108+
Note, set `inside_container = True` to reflect the changed filepaths inside the container.
109+
110+
111+
------------------------------------------------
112+
## 3. Bring your own FL framework
113+
If you would like to use your own FL framework to participate in the challenge,
114+
please modify the Dockerfile accordingly to include all the dependencies.
115+
116+
Your container needs to provide the following scripts that implement the starting of server, clients, and finalizing of the server.
117+
They will be executed by the system in the following order.
118+
119+
### 3.1 start server
120+
```
121+
/code/start_server.sh
122+
```
123+
124+
### 3.2 start each client (in parallel)
125+
```
126+
/code/start_site-1.sh
127+
/code/start_site-2.sh
128+
/code/start_site-3.sh
129+
```
130+
131+
### 3.3 finalize the server
132+
```
133+
/code/finalize_server.sh
134+
```
135+
For an example on how the challenge system will execute these commands, see the provided `run_docker*.sh` scripts.
136+
137+
### 3.4 Communication
138+
The communication channels for FL will be restricted to the ports specified in [fl_project.yml](./code/fl_project.yml).
139+
Your FL framework will also need those ports for implementing the communication.
140+
141+
### 3.5 Results
142+
Results will need to be written to `/result/predictions.json`.
143+
Please follow the format produced by the reference implementation at [./result_server_example/predictions.json](./result_server_example/predictions.json)
144+
(available after running `python3 ./code/pt/utils/download_datalists_and_predictions.py`)
145+
The code is expected to return a JSON file containing at least a list of image names and prediction probabilities for each breast density class
146+
for the global model (should be named `SRV_best_FL_global_model.pt`).
147+
```
148+
{
149+
"site-1": {
150+
"SRV_best_FL_global_model.pt": {
151+
...
152+
"test_probs": [{
153+
"image": "Calc-Test_P_00643_LEFT_MLO.npy",
154+
"probs": [0.005602597258985043, 0.7612965703010559, 0.23040543496608734, 0.0026953918859362602]
155+
}, {
156+
...
157+
},
158+
"site-2": {
159+
"SRV_best_FL_global_model.pt": {
160+
...
161+
"test_probs": [{
162+
"image": "Calc-Test_P_00643_LEFT_MLO.npy",
163+
"probs": [0.005602597258985043, 0.7612965703010559, 0.23040543496608734, 0.0026953918859362602]
164+
}, {
165+
...
166+
},
167+
"site-3": {
168+
"SRV_best_FL_global_model.pt": {
169+
...
170+
"test_probs": [{
171+
"image": "Calc-Test_P_00643_LEFT_MLO.npy",
172+
"probs": [0.005602597258985043, 0.7612965703010559, 0.23040543496608734, 0.0026953918859362602]
173+
}, {
174+
...
175+
}
176+
```

0 commit comments

Comments
 (0)