[ReadyForReview] Auto3DSeg DataAnalyzer OOM and other minor issues (#5278)

mingxin-zheng · web-flow · commit bb81a23203eb · 2022-10-11T13:13:46.000+01:00
Fixes #5277. ### Updated results In my local test env, I have the following results: - The change in GPU memory before/after DataAnalyzer is less than 5MB after the fix. Previously, there were lots of cached PyTorch tensors and CuPy variables that were not released for training and took up to several GBs of GPU memory. - DataAnalyzer can also process larger images now because the leaks are fixed (a 3D image of size 512x512x512 passed on a 12GB RTX 3080Ti). ### Description Auto3DSeg DataAnalyzer occupied a large chunk of memory and was unable to release it during training. The possible reasons are: - Training is done by a subprocess call, and PyTorch in the subprocess is unable to find the memory pool allocated by the main process - GPU memory leakage (DataAnalyzer math operations use torch functions and CuPy). In addition, the test functions need improvements and AutoRunner needs to expose the API call to change the device of DataAnalyzer. ### Types of changes  - [x] Non-breaking change (fix or new feature that would not break existing functionality). - [ ] Breaking change (fix or new feature that would cause existing functionality to change). - [ ] New tests added to cover the changes. - [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`. - [ ] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`. - [ ] In-line docstrings updated. - [ ] Documentation updated, tested `make html` command in the `docs/` folder. Signed-off-by: Mingxin Zheng <18563433+mingxin-zheng@users.noreply.github.com>
diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml
@@ -23,7 +23,7 @@ jobs:
           - "PT17+CUDA102"
           - "PT18+CUDA102"
           - "PT18+CUDA112"
-          - "PT112+CUDA117"
+          - "PT112+CUDA118"
           - "PT110+CUDA102"
           - "PT112+CUDA102"
         include:
diff --git a/monai/apps/auto3dseg/data_analyzer.py b/monai/apps/auto3dseg/data_analyzer.py
@@ -122,8 +122,8 @@ def __init__(
         output_path: str = "./data_stats.yaml",
         average: bool = True,
         do_ccp: bool = True,
-        device: Union[str, torch.device] = "cuda",
-        worker: int = 0,
+        device: Union[str, torch.device] = "cpu",
+        worker: int = 2,
         image_key: str = "image",
         label_key: Optional[str] = "label",
     ):
@@ -137,13 +137,10 @@ def __init__(
         self.average = average
         self.do_ccp = do_ccp
         self.device = torch.device(device)
-        self.worker = worker
+        self.worker = 0 if (self.device.type == "cuda") else worker
         self.image_key = image_key
         self.label_key = label_key
 
-        if (self.device.type == "cuda") and (worker > 0):
-            raise ValueError("CUDA does not support multiple subprocess. If device is GPU, please set worker to 0")
-
     @staticmethod
     def _check_data_uniformity(keys: List[str], result: Dict):
         """
@@ -232,8 +229,14 @@ def get_all_case_stats(self):
         result[DataStatsKeys.SUMMARY] = summarizer.summarize(result[DataStatsKeys.BY_CASE])
 
         if not self._check_data_uniformity([ImageStatsKeys.SPACING], result):
-            logger.warning("Data is not completely uniform. MONAI transforms may provide unexpected result")
+            logger.warning("data spacing is not completely uniform. MONAI transforms may provide unexpected result")
 
         ConfigParser.export_config_file(result, self.output_path, fmt="yaml", default_flow_style=None)
 
+        del d["image"], d["label"]
+        if self.device.type == "cuda":
+            # release unreferenced tensors to mitigate OOM
+            # limitation: https://github.com/pytorch/pytorch/issues/12873#issuecomment-482916237
+            torch.cuda.empty_cache()
+
         return result
diff --git a/monai/auto3dseg/__init__.py b/monai/auto3dseg/__init__.py
@@ -11,6 +11,7 @@
 
 from .algo_gen import Algo, AlgoGen
 from .analyzer import (
+    Analyzer,
     FgImageStats,
     FgImageStatsSumm,
     FilenameStats,
diff --git a/monai/auto3dseg/analyzer.py b/monai/auto3dseg/analyzer.py
@@ -229,8 +229,10 @@ def __call__(self, data):
         """
         d = dict(data)
         start = time.time()
-        ndas = data[self.image_key]
-        ndas = [ndas[i] for i in range(ndas.shape[0])]
+        restore_grad_state = torch.is_grad_enabled()
+        torch.set_grad_enabled(False)
+
+        ndas = [d[self.image_key][i] for i in range(d[self.image_key].shape[0])]
         if "nda_croppeds" not in d:
             nda_croppeds = [get_foreground_image(nda) for nda in ndas]
 
@@ -250,8 +252,10 @@ def __call__(self, data):
         if not verify_report_format(report, self.get_report_format()):
             raise RuntimeError(f"report generated by {self.__class__} differs from the report format.")
 
-        logger.debug(f"Get image stats spent {time.time()-start}")
         d[self.stats_name] = report
+
+        torch.set_grad_enabled(restore_grad_state)
+        logger.debug(f"Get image stats spent {time.time()-start}")
         return d
 
 
@@ -307,9 +311,11 @@ def __call__(self, data) -> dict:
         """
 
         d = dict(data)
+        start = time.time()
+        restore_grad_state = torch.is_grad_enabled()
+        torch.set_grad_enabled(False)
 
-        ndas = d[self.image_key]  # (1,H,W,D) or (C,H,W,D)
-        ndas = [ndas[i] for i in range(ndas.shape[0])]
+        ndas = [d[self.image_key][i] for i in range(d[self.image_key].shape[0])]
         ndas_label = d[self.label_key]  # (H,W,D)
         nda_foregrounds = [get_foreground_label(nda, ndas_label) for nda in ndas]
 
@@ -324,6 +330,9 @@ def __call__(self, data) -> dict:
             raise RuntimeError(f"report generated by {self.__class__} differs from the report format.")
 
         d[self.stats_name] = report
+
+        torch.set_grad_enabled(restore_grad_state)
+        logger.debug(f"Get foreground image stats spent {time.time()-start}")
         return d
 
 
@@ -423,9 +432,12 @@ def __call__(self, data):
             functions. If the input has nan/inf, the stats results will be nan/inf.
         """
         d = dict(data)
+        start = time.time()
+        using_cuda = True if d[self.image_key].device.type == "cuda" else False
+        restore_grad_state = torch.is_grad_enabled()
+        torch.set_grad_enabled(False)
 
-        ndas = d[self.image_key]  # (1,H,W,D) or (C,H,W,D)
-        ndas = [ndas[i] for i in range(ndas.shape[0])]
+        ndas = [d[self.image_key][i] for i in range(d[self.image_key].shape[0])]
         ndas_label = d[self.label_key]  # (H,W,D)
         nda_foregrounds = [get_foreground_label(nda, ndas_label) for nda in ndas]
 
@@ -435,7 +447,6 @@ def __call__(self, data):
 
         unique_label = unique_label.astype(np.int8).tolist()
 
-        start = time.time()
         label_substats = []  # each element is one label
         pixel_sum = 0
         pixel_arr = []
@@ -444,13 +455,20 @@ def __call__(self, data):
             label_dict: Dict[str, Any] = {}
             mask_index = ndas_label == index
 
+            nda_masks = [nda[mask_index] for nda in ndas]
             label_dict[LabelStatsKeys.IMAGE_INTST] = [
-                self.ops[LabelStatsKeys.IMAGE_INTST].evaluate(nda[mask_index]) for nda in ndas
+                self.ops[LabelStatsKeys.IMAGE_INTST].evaluate(nda_m) for nda_m in nda_masks
             ]
+
             pixel_count = sum(mask_index)
             pixel_arr.append(pixel_count)
             pixel_sum += pixel_count
             if self.do_ccp:  # apply connected component
+                if using_cuda:
+                    # The back end of get_label_ccp is CuPy
+                    # which is unable to automatically release CUDA GPU memory held by PyTorch
+                    del nda_masks
+                    torch.cuda.empty_cache()
                 shape_list, ncomponents = get_label_ccp(mask_index)
                 label_dict[LabelStatsKeys.LABEL_SHAPE] = shape_list
                 label_dict[LabelStatsKeys.LABEL_NCOMP] = ncomponents
@@ -472,6 +490,8 @@ def __call__(self, data):
             raise RuntimeError(f"report generated by {self.__class__} differs from the report format.")
 
         d[self.stats_name] = report
+
+        torch.set_grad_enabled(restore_grad_state)
         logger.debug(f"Get label stats spent {time.time()-start}")
         return d
 
diff --git a/monai/auto3dseg/seg_summarizer.py b/monai/auto3dseg/seg_summarizer.py
@@ -104,7 +104,7 @@ def add_analyzer(self, case_analyzer, summary_analyzer) -> None:
 
             .. code-block:: python
 
-                from monai.auto3dseg.analyzer import Analyzer
+                from monai.auto3dseg import Analyzer
                 from monai.auto3dseg.utils import concat_val_to_np
                 from monai.auto3dseg.analyzer_engine import SegSummarizer
 
diff --git a/monai/auto3dseg/utils.py b/monai/auto3dseg/utils.py
@@ -106,6 +106,9 @@ def get_label_ccp(mask_index: MetaTensor, use_gpu: bool = True) -> Tuple[List[An
             shape_list.append(bbox_shape)
         ncomponents = len(vals)
 
+        del mask_cupy, labeled, vals, comp_idx, ncomp
+        cp.get_default_memory_pool().free_all_blocks()
+
     elif has_measure:
         labeled, ncomponents = measure_np.label(mask_index.data.cpu().numpy(), background=-1, return_num=True)
         for ncomp in range(1, ncomponents + 1):
@@ -174,7 +177,7 @@ def concat_val_to_np(
     elif ragged:
         return np.concatenate(np_list, **kwargs)  # type: ignore
     else:
-        return np.concatenate([np_list], **kwargs)
+        return np.concatenate([np_list], **kwargs)  # type: ignore
 
 
 def concat_multikeys_to_dict(
diff --git a/tests/test_auto3dseg.py b/tests/test_auto3dseg.py
diff --git a/tests/test_cv2_dist.py b/tests/test_cv2_dist.py