
Commit 709bb18

fix: Disable debugger when checkpointing is enabled with distributed training (#2264)
1 parent b66cb98 commit 709bb18

File tree

3 files changed: 128 additions & 7 deletions


src/sagemaker/estimator.py

Lines changed: 15 additions & 1 deletion
@@ -2219,7 +2219,21 @@ def _validate_and_set_debugger_configs(self):
         ):
             self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
         elif not self.debugger_hook_config:
-            self.debugger_hook_config = None
+            # set hook config to False if _region_supports_debugger is False
+            self.debugger_hook_config = False
+
+        # Disable debugger if checkpointing is enabled by the customer
+        if self.checkpoint_s3_uri and self.checkpoint_local_path and self.debugger_hook_config:
+            if self._framework_name in {"mxnet", "pytorch", "tensorflow"}:
+                if self.instance_count > 1 or (
+                    hasattr(self, "distribution")
+                    and self.distribution is not None  # pylint: disable=no-member
+                ):
+                    logger.info(
+                        "SMDebug Does Not Currently Support \
+                        Distributed Training Jobs With Checkpointing Enabled"
+                    )
+                    self.debugger_hook_config = False
 
     def _stage_user_code_in_s3(self):
         """Upload the user training script to s3 and return the location.

src/sagemaker/tensorflow/estimator.py

Lines changed: 1 addition & 6 deletions
@@ -18,7 +18,6 @@
 from packaging import version
 
 from sagemaker import image_uris, s3, utils
-from sagemaker.debugger import DebuggerHookConfig
 from sagemaker.deprecations import renamed_kwargs
 from sagemaker.estimator import Framework
 import sagemaker.fw_utils as fw
@@ -347,6 +346,7 @@ def _validate_and_set_debugger_configs(self):
 
         Else, set default HookConfig
         """
+        super(TensorFlow, self)._validate_and_set_debugger_configs()
         ps_enabled = "parameter_server" in self.distribution and self.distribution[
             "parameter_server"
         ].get("enabled", False)
@@ -358,11 +358,6 @@ def _validate_and_set_debugger_configs(self):
             )
             self.debugger_hook_config = None
             self.debugger_rule_configs = None
-        elif self.debugger_hook_config is None and fw._region_supports_debugger(
-            self.sagemaker_session.boto_session.region_name
-        ):
-            # Set defaults for debugging.
-            self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
 
     def transformer(
         self,
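With this change the TensorFlow estimator no longer builds its own default `DebuggerHookConfig`; it defers to the parent `Framework` implementation (which now also applies the checkpointing check) and keeps only the parameter-server special case. A simplified sketch of the resulting override, assuming the unchanged surrounding lines match the released SDK:

```python
def _validate_and_set_debugger_configs(self):
    """Disable the Debugger hook for parameter-server jobs; otherwise defer to Framework."""
    # Default hook config and the new checkpointing/distribution check come from the parent.
    super(TensorFlow, self)._validate_and_set_debugger_configs()

    ps_enabled = "parameter_server" in self.distribution and self.distribution[
        "parameter_server"
    ].get("enabled", False)
    if ps_enabled:
        # smdebug does not support parameter-server training, so drop hook and rule configs.
        self.debugger_hook_config = None
        self.debugger_rule_configs = None
```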

tests/integ/test_debugger.py

Lines changed: 112 additions & 0 deletions
@@ -24,6 +24,9 @@
     TensorBoardOutputConfig,
 )
 from sagemaker.mxnet.estimator import MXNet
+from sagemaker.pytorch.estimator import PyTorch
+from sagemaker.tensorflow.estimator import TensorFlow
+from sagemaker.xgboost.estimator import XGBoost
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.retry import retries
 from tests.integ.timeout import timeout
@@ -351,6 +354,115 @@ def test_mxnet_with_debugger_hook_config(
         _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
 
 
+def test_debug_hook_disabled_with_checkpointing(
+    sagemaker_session,
+    mxnet_training_latest_version,
+    mxnet_training_latest_py_version,
+    cpu_instance_type,
+):
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        s3_output_path = os.path.join(
+            "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4())
+        )
+        debugger_hook_config = DebuggerHookConfig(
+            s3_output_path=os.path.join(s3_output_path, "tensors")
+        )
+
+        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
+
+        # Estimator with checkpointing enabled
+        mx = MXNet(
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version=mxnet_training_latest_version,
+            py_version=mxnet_training_latest_py_version,
+            instance_count=1,
+            instance_type=cpu_instance_type,
+            sagemaker_session=sagemaker_session,
+            debugger_hook_config=debugger_hook_config,
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        mx._prepare_for_training()
+
+        # Debug hook should be enabled
+        assert mx.debugger_hook_config is not None
+
+        # Estimator with checkpointing enabled and instance_count > 1
+        mx = MXNet(
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version=mxnet_training_latest_version,
+            py_version=mxnet_training_latest_py_version,
+            instance_count=2,
+            instance_type=cpu_instance_type,
+            sagemaker_session=sagemaker_session,
+            debugger_hook_config=debugger_hook_config,
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        mx._prepare_for_training()
+        # Debug hook should be disabled
+        assert mx.debugger_hook_config is False
+
+        # Estimator with checkpointing enabled and SMDataParallel enabled
+        pt = PyTorch(
+            base_job_name="pytorch-smdataparallel-mnist",
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version="1.8.0",
+            py_version="py36",
+            instance_count=1,
+            # For training with p3dn instances use ml.p3dn.24xlarge; with p4d instances use ml.p4d.24xlarge
+            instance_type="ml.p3.16xlarge",
+            sagemaker_session=sagemaker_session,
+            # Training using the SMDataParallel distributed training framework
+            distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        pt._prepare_for_training()
+        # Debug hook should be disabled
+        assert pt.debugger_hook_config is False
+
+        # Estimator with checkpointing enabled and SMModelParallel enabled
+        tf = TensorFlow(
+            base_job_name="tf-smdataparallel-mnist",
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version="2.4.1",
+            py_version="py36",
+            instance_count=1,
+            # For training with p3dn instances use ml.p3dn.24xlarge; with p4d instances use ml.p4d.24xlarge
+            instance_type="ml.p3.16xlarge",
+            sagemaker_session=sagemaker_session,
+            # Training using the SMModelParallel distributed training framework
+            distribution={"smdistributed": {"modelparallel": {"enabled": True}}},
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        tf._prepare_for_training()
+        # Debug hook should be disabled
+        assert tf.debugger_hook_config is False
+
+        # XGBoost Estimator (not covered by the new check)
+        xg = XGBoost(
+            base_job_name="test_xgboost",
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version="1.2-1",
+            py_version="py3",
+            instance_count=2,
+            # For training with p3dn instances use ml.p3dn.24xlarge; with p4d instances use ml.p4d.24xlarge
+            instance_type="ml.p3.16xlarge",
+            sagemaker_session=sagemaker_session,
+            # No distribution or checkpointing configured
+        )
+        xg._prepare_for_training()
+        # Debug hook should be enabled
+        assert xg.debugger_hook_config is not None
+
+
 def test_mxnet_with_rules_and_debugger_hook_config(
     sagemaker_session,
     mxnet_training_latest_version,
