TorchIO download data to MONAI_DATA_DIRECTORY (#686)

rijobro · web-flow · commit cf3df67ef3f3 · 2022-05-04T16:03:10.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -138,3 +138,13 @@ tests/testing_data/*Hippocampus*
 # Ignore torch saves
 */torch/runs
 logs
+*/runs
+lightning_logs
+
+# Ignore automatically created files
+*.ts
+nohup.out
+deepgrow/ignite/_image.nii.gz
+*.zip
+deployment/bentoml/mednist_classifier_bentoml.py
+deployment/ray/mednist_classifier_start.py
diff --git a/modules/TorchIO_MONAI_PyTorch_Lightning.ipynb b/modules/TorchIO_MONAI_PyTorch_Lightning.ipynb
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +69,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "id": "KvbbZuhmquRR"
    },
@@ -92,14 +92,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {
     "id": "gduPdIturUIB"
    },
    "outputs": [],
    "source": [
-    "from pathlib import Path\n",
     "from datetime import datetime\n",
+    "import os\n",
+    "import tempfile\n",
+    "from glob import glob\n",
     "\n",
     "import torch\n",
     "from torch.utils.data import random_split, DataLoader\n",
@@ -117,6 +119,36 @@
     "%load_ext tensorboard"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup data directory\n",
+    "\n",
+    "You can specify a directory with the `MONAI_DATA_DIRECTORY` environment variable.  \n",
+    "This allows you to save results and reuse downloads.  \n",
+    "If not specified, a temporary directory will be used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/mnt/data/rbrown/Documents/Data/MONAI\n"
+     ]
+    }
+   ],
+   "source": [
+    "directory = os.environ.get(\"MONAI_DATA_DIRECTORY\")\n",
+    "root_dir = tempfile.mkdtemp() if directory is None else directory\n",
+    "print(root_dir)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -145,20 +177,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {
     "id": "KuhTaRl3vf37"
    },
    "outputs": [],
    "source": [
-    "\n",
-    "\n",
     "class MedicalDecathlonDataModule(pl.LightningDataModule):\n",
     "    def __init__(self, task, batch_size, train_val_ratio):\n",
     "        super().__init__()\n",
     "        self.task = task\n",
     "        self.batch_size = batch_size\n",
-    "        self.dataset_dir = Path(task)\n",
+    "        self.base_dir = root_dir\n",
+    "        self.dataset_dir = os.path.join(root_dir, task)\n",
     "        self.train_val_ratio = train_val_ratio\n",
     "        self.subjects = None\n",
     "        self.test_subjects = None\n",
@@ -175,16 +206,13 @@
     "        return shapes.max(axis=0)\n",
     "\n",
     "    def download_data(self):\n",
-    "        if not self.dataset_dir.is_dir():\n",
-    "            url = 'https://msd-for-monai.s3-us-west-2.amazonaws.com/Task04_Hippocampus.tar'\n",
-    "            monai.apps.download_and_extract(url=url, output_dir=\".\")\n",
+    "        if not os.path.isdir(self.dataset_dir):\n",
+    "            url = f'https://msd-for-monai.s3-us-west-2.amazonaws.com/{self.task}.tar'\n",
+    "            monai.apps.download_and_extract(url=url, output_dir=self.base_dir)\n",
     "\n",
-    "        def get_niis(d):\n",
-    "            return sorted(p for p in d.glob('*.nii*') if not p.name.startswith('.'))\n",
-    "\n",
-    "        image_training_paths = get_niis(self.dataset_dir / 'imagesTr')\n",
-    "        label_training_paths = get_niis(self.dataset_dir / 'labelsTr')\n",
-    "        image_test_paths = get_niis(self.dataset_dir / 'imagesTs')\n",
+    "        image_training_paths = sorted(glob(os.path.join(self.dataset_dir, 'imagesTr', \"*.nii*\")))\n",
+    "        label_training_paths = sorted(glob(os.path.join(self.dataset_dir, 'labelsTr', \"*.nii*\")))\n",
+    "        image_test_paths = sorted(glob(os.path.join(self.dataset_dir, 'imagesTs', \"*.nii*\")))\n",
     "        return image_training_paths, label_training_paths, image_test_paths\n",
     "\n",
     "    def prepare_data(self):\n",
@@ -260,7 +288,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {
     "id": "hcHf9w2nLfyC"
    },
@@ -284,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"
@@ -293,16 +321,6 @@
     "outputId": "7cb39051-4c26-4811-b838-8a5e938e53a3"
    },
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Downloading...\n",
-      "From: https://drive.google.com/uc?id=1RzPB1_bqzQhlWvU-YGvZzhx2omcDh38C\n",
-      "To: /content/Task04_Hippocampus.tar\n",
-      "28.4MB [00:00, 82.8MB/s]\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -341,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {
     "id": "1Ov3H12p6Qx1"
    },
@@ -395,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/"