|
60 | 60 | "source": [
|
61 | 61 | "import os\n",
|
62 | 62 | "import tempfile\n",
|
63 |
| - "import torch\n", |
64 | 63 | "\n",
|
65 | 64 | "from monai.bundle.config_parser import ConfigParser\n",
|
66 | 65 | "from monai.apps import download_and_extract\n",
|
67 | 66 | "\n",
|
68 | 67 | "from monai.apps.auto3dseg import AutoRunner\n",
|
69 |
| - "from monai.auto3dseg import datafold_read\n", |
70 | 68 | "from monai.config import print_config\n",
|
71 | 69 | "\n",
|
72 | 70 | "print_config()"
|
73 | 71 | ]
|
74 | 72 | },
|
75 | 73 | {
|
| 74 | + "attachments": {}, |
76 | 75 | "cell_type": "markdown",
|
77 | 76 | "metadata": {},
|
78 | 77 | "source": [
|
79 | 78 | "## Download dataset\n",
|
80 | 79 | "\n",
|
81 |
| - "We provide a toy datalist file that splits a subset of the downloaded datasets into five folds.\n", |
82 |
| - "\n", |
83 |
| - "> NOTE: Each validation set only has 6 images in one fold of training.\n", |
84 |
| - "> Therefore, we need to set a limit on the total number of GPUs we're using in this notebook." |
| 80 | + "We provide a toy datalist file that splits a subset of the downloaded datasets into five folds." |
85 | 81 | ]
|
86 | 82 | },
|
87 | 83 | {
|
|
102 | 98 | "if not os.path.exists(dataroot):\n",
|
103 | 99 | " download_and_extract(resource, compressed_file, root_dir)\n",
|
104 | 100 | "\n",
|
105 |
| - "datalist_file = os.path.join(\"..\", \"tasks\", \"msd\", msd_task, \"msd_\" + msd_task.lower() + \"_folds.json\")\n", |
106 |
| - "\n", |
107 |
| - "if torch.cuda.device_count() > 6:\n", |
108 |
| - " os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", |
109 |
| - " os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1,2,3,4,5\"" |
| 101 | + "datalist_file = os.path.join(\"..\", \"tasks\", \"msd\", msd_task, \"msd_\" + msd_task.lower() + \"_folds.json\")" |
110 | 102 | ]
|
111 | 103 | },
|
112 | 104 | {
|
|
267 | 259 | ]
|
268 | 260 | },
|
269 | 261 | {
|
| 262 | + "attachments": {}, |
270 | 263 | "cell_type": "markdown",
|
271 | 264 | "metadata": {},
|
272 | 265 | "source": [
|
273 | 266 | "## Customize training parameters by overriding the default values\n",
|
274 | 267 | "\n",
|
275 | 268 | "`set_training_params` in `AutoRunner` provides an interface to change all algorithms' training parameters in one line. \n",
|
276 | 269 | "\n",
|
277 |
| - "> NOTE **Auto3DSeg** uses MONAI bundle templates to perform training, validation, and inference. The number of epochs/iterations of training is specified by the config files in each template.\n", |
278 |
| - "> Users can override these these values in the bundle templates.\n", |
279 |
| - "> But users should consider that some bundle templates may use `num_iterations` and other may use `num_epochs` to iterate.\n", |
| 270 | + "NOTE: \n", |
| 271 | + "**Auto3DSeg** uses MONAI bundle templates to perform training, validation, and inference.\n", |
| 272 | + "The number of epochs/iterations of training is specified by the config files in each template.\n", |
| 273 | + "Users can override these values in the bundle templates.\n", |
| 274 | + "But users should consider that some bundle templates may use `num_iterations` and others may use `num_epochs` to iterate.\n", |
280 | 275 | "\n",
|
281 |
| - "For demo purpose, below is a code block to convert num_epoch to iteration style and override all algorithms with the same training parameters for 1-GPU/2-GPU machine. \n" |
| 276 | + "For demo purposes, below is a code block to convert num_epoch to iteration style and override all algorithms with the same training parameters.\n", |
| 277 | + "The setup works fine on a machine with 8 or fewer GPUs.\n", |
| 278 | + "The datalist in this example is only using a subset of the original dataset.\n", |
| 279 | + "Users need to ensure the number of GPUs is not greater than the number of partitions the training dataset can be split into.\n", |
| 280 | + "For example, the following code block is not suitable for a 16-GPU system.\n", |
| 281 | + "In such cases, please change the code block accordingly.\n" |
282 | 282 | ]
|
283 | 283 | },
|
284 | 284 | {
|
|
289 | 289 | "source": [
|
290 | 290 | "max_epochs = 2\n",
|
291 | 291 | "\n",
|
292 |
| - "# safeguard to ensure max_epochs is greater or equal to 2\n", |
293 |
| - "max_epochs = max(max_epochs, 2)\n", |
294 |
| - "\n", |
295 |
| - "num_gpus = 1 if \"multigpu\" in input_cfg and not input_cfg[\"multigpu\"] else torch.cuda.device_count()\n", |
296 |
| - "\n", |
297 |
| - "num_epoch = max_epochs\n", |
298 |
| - "num_images_per_batch = 2\n", |
299 |
| - "files_train_fold0, _ = datafold_read(datalist_file, \"\", 0)\n", |
300 |
| - "n_data = len(files_train_fold0)\n", |
301 |
| - "n_iter = int(num_epoch * n_data / num_images_per_batch / num_gpus)\n", |
302 |
| - "n_iter_val = int(n_iter / 2)\n", |
303 |
| - "\n", |
304 | 292 | "train_param = {\n",
|
305 |
| - " \"num_iterations\": n_iter,\n", |
306 |
| - " \"num_iterations_per_validation\": n_iter_val,\n", |
307 |
| - " \"num_images_per_batch\": num_images_per_batch,\n", |
308 |
| - " \"num_epochs\": num_epoch,\n", |
309 |
| - " \"num_warmup_iterations\": n_iter_val,\n", |
| 293 | + " \"num_epochs_per_validation\": 1,\n", |
| 294 | + " \"num_images_per_batch\": 2,\n", |
| 295 | + " \"num_epochs\": max_epochs,\n", |
| 296 | + " \"num_warmup_epochs\": 1,\n", |
310 | 297 | "}\n",
|
| 298 | + "\n", |
311 | 299 | "runner = AutoRunner(input=input)\n",
|
312 | 300 | "runner.set_training_params(params=train_param)\n",
|
313 | 301 | "# runner.run()"
|
|
360 | 348 | ]
|
361 | 349 | },
|
362 | 350 | {
|
| 351 | + "attachments": {}, |
363 | 352 | "cell_type": "markdown",
|
364 | 353 | "metadata": {},
|
365 | 354 | "source": [
|
366 | 355 | "## Train model with HPO\n",
|
367 | 356 | "\n",
|
368 | 357 | "**Auto3DSeg** supports hyper parameter optimization (HPO) via `NNI` and `Optuna` backends.\n",
|
369 |
| - "If you wound like to the use `Optuna`, please check the [notebook](hpo_optuna.ipynb) for detailed usage.\n", |
| 358 | + "If you would like to use `Optuna`, please check the [notebook](hpo_optuna.ipynb) for detailed usage.\n", |
370 | 359 | "\n",
|
371 | 360 | "Here we demonstrate the HPO option with `NNI` by Microsoft.\n",
|
372 | 361 | "Please install it via `pip install nni` if you want to run HPO with it in this tutorial and haven't done so at the beginning of the notebook.\n",
|
373 | 362 | "AutoRunner supports the `NNI` backend with a grid search method by automatically generating the `NNI` config and running `nnictl` commands in a subprocess.\n",
|
374 | 363 | "\n",
|
375 | 364 | "## Use `AutoRunner` with `NNI` backend to perform grid search\n",
|
376 | 365 | "\n",
|
377 |
| - "After `runner.run()` is executed, `nni` will attempt to start a web service using port 8088 by default. If you are running the tutorial in a remote host, please make sure the port is available on the system.\n", |
| 366 | + "After `runner.run()` is executed, `nni` will attempt to start a web service using port 8088 by default. If you are running the tutorial on a remote host, please ensure the port is available on the system.\n", |
378 | 367 | "\n",
|
379 | 368 | "> NOTE: it is recommended to turn off ensemble if the users are using HPO features.\n",
|
380 | 369 | "> By default, all the models are saved under the working directory, including the ones tuned by the HPO package.\n",
|
381 |
| - "> Users may want to read the HPO results before the taking the next step.\n", |
| 370 | + "> Users may want to read the HPO results before taking the next step.\n", |
382 | 371 | "> If the users want to ensemble all the models, the `ensemble` option can be set to True."
|
383 | 372 | ]
|
384 | 373 | },
|
|
395 | 384 | ]
|
396 | 385 | },
|
397 | 386 | {
|
| 387 | + "attachments": {}, |
398 | 388 | "cell_type": "markdown",
|
399 | 389 | "metadata": {},
|
400 | 390 | "source": [
|
|
403 | 393 | "The default `NNI` config that `AutoRunner` uses looks like below. Users can override some of the parameters via the `set_hpo_params` interface:\n",
|
404 | 394 | "\n",
|
405 | 395 | "```python\n",
|
| 396 | + "import torch\n", |
406 | 397 | "default_nni_config = {\n",
|
407 | 398 | " \"trialCodeDirectory\": \".\",\n",
|
408 | 399 | " \"trialGpuNumber\": torch.cuda.device_count(),\n",
|
|
449 | 440 | "outputs": [],
|
450 | 441 | "source": [
|
451 | 442 | "runner = AutoRunner(input=input, hpo=True, ensemble=False)\n",
|
| 443 | + "num_epoch = 2\n", |
452 | 444 | "hpo_params = {\n",
|
453 | 445 | " \"maxTrialNumber\": 20,\n",
|
454 | 446 | " \"maxExperimentDuration\": \"30m\",\n",
|
455 |
| - " \"num_iterations\": n_iter,\n", |
456 |
| - " \"num_iterations_per_validation\": n_iter_val,\n", |
457 |
| - " \"num_images_per_batch\": num_images_per_batch,\n", |
458 |
| - " \"num_epochs\": num_epoch,\n", |
459 |
| - " \"num_warmup_iterations\": n_iter_val,\n", |
460 |
| - " \"training#num_iterations\": n_iter,\n", |
461 |
| - " \"training#num_iterations_per_validation\": n_iter_val,\n", |
462 |
| - " \"searching#num_iterations\": n_iter,\n", |
463 |
| - " \"searching#num_iterations_per_validation\": n_iter_val,\n", |
464 |
| - " \"searching#num_warmup_iterations\": n_iter,\n", |
| 447 | + " \"num_epochs_per_validation\": 1,\n", |
| 448 | + " \"num_images_per_batch\": 1,\n", |
| 449 | + " \"num_epochs\": 2,\n", |
| 450 | + " \"num_warmup_epochs\": 1,\n", |
| 451 | + " \"training#num_epochs\": 2,\n", |
| 452 | + " \"training#num_epochs_per_validation\": 1,\n", |
| 453 | + " \"searching#num_epochs\": 2,\n", |
| 454 | + " \"searching#num_epochs_per_validation\": 1,\n", |
| 455 | + " \"searching#num_warmup_epochs\": 1,\n", |
465 | 456 | "}\n",
|
466 | 457 | "search_space = {\"learning_rate\": {\"_type\": \"choice\", \"_value\": [0.0001, 0.01]}}\n",
|
467 | 458 | "runner.set_num_fold(num_fold=1)\n",
|
|
0 commit comments