
Commit fd9dd85 (parent: 2e694a7)

disable debugger when checkpointing is enabled

File tree: 2 files changed (+86, −1 lines)


src/sagemaker/estimator.py

Lines changed: 14 additions & 1 deletion
@@ -2209,7 +2209,20 @@ def _validate_and_set_debugger_configs(self):
         ):
             self.debugger_hook_config = DebuggerHookConfig(s3_output_path=self.output_path)
         elif not self.debugger_hook_config:
-            self.debugger_hook_config = None
+            self.debugger_hook_config = False
+
+        # Disable debugger if checkpointing is enabled by the customer
+        _should_disable_debugger = False
+        if self.checkpoint_s3_uri and self.checkpoint_local_path and self.debugger_hook_config:
+            if self.instance_count > 1:
+                _should_disable_debugger = True
+            if hasattr(self, "distribution") and self.distribution is not None:
+                _should_disable_debugger = True
+        if _should_disable_debugger:
+            logger.info(
+                "SMDebug Does Not Currently Support Distributed Training Jobs With Checkpointing Enabled"
+            )
+            self.debugger_hook_config = False
 
     def _stage_user_code_in_s3(self):
         """Upload the user training script to s3 and return the location.

tests/integ/test_debugger.py

Lines changed: 72 additions & 0 deletions
@@ -24,6 +24,7 @@
     TensorBoardOutputConfig,
 )
 from sagemaker.mxnet.estimator import MXNet
+from sagemaker.pytorch.estimator import PyTorch
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.retry import retries
 from tests.integ.timeout import timeout
@@ -351,6 +352,77 @@ def test_mxnet_with_debugger_hook_config(
     _wait_and_assert_that_no_rule_jobs_errored(training_job=mx.latest_training_job)
 
 
+def test_debug_hook_disabled_with_checkpointing(
+    sagemaker_session,
+    mxnet_training_latest_version,
+    mxnet_training_latest_py_version,
+    cpu_instance_type,
+):
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        s3_output_path = os.path.join(
+            "s3://", sagemaker_session.default_bucket(), str(uuid.uuid4())
+        )
+        debugger_hook_config = DebuggerHookConfig(
+            s3_output_path=os.path.join(s3_output_path, "tensors")
+        )
+
+        script_path = os.path.join(DATA_DIR, "mxnet_mnist", "mnist_gluon.py")
+
+        # Estimator with checkpointing enabled
+        mx = MXNet(
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version=mxnet_training_latest_version,
+            py_version=mxnet_training_latest_py_version,
+            instance_count=1,
+            instance_type=cpu_instance_type,
+            sagemaker_session=sagemaker_session,
+            debugger_hook_config=debugger_hook_config,
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        mx._prepare_for_training()
+
+        # Debug Hook should be enabled
+        assert mx.debugger_hook_config is not None
+
+        # Estimator with checkpointing enabled and instance_count > 1
+        mx = MXNet(
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version=mxnet_training_latest_version,
+            py_version=mxnet_training_latest_py_version,
+            instance_count=2,
+            instance_type=cpu_instance_type,
+            sagemaker_session=sagemaker_session,
+            debugger_hook_config=debugger_hook_config,
+            checkpoint_local_path="/opt/ml/checkpoints",
+            checkpoint_s3_uri=os.path.join(s3_output_path, "checkpoints"),
+        )
+        mx._prepare_for_training()
+        # Debug Hook should be disabled
+        assert mx.debugger_hook_config is False
+
+        # Estimator with SMDataParallel distribution enabled
+        pt = PyTorch(
+            base_job_name="pytorch-smdataparallel-mnist",
+            entry_point=script_path,
+            role="SageMakerRole",
+            framework_version="1.8.0",
+            py_version="py36",
+            instance_count=1,
+            # For p3dn instances use ml.p3dn.24xlarge; for p4d instances use ml.p4d.24xlarge
+            instance_type="ml.p3.16xlarge",
+            sagemaker_session=sagemaker_session,
+            # Training using SMDataParallel Distributed Training Framework
+            distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
+            debugger_hook_config=False,
+        )
+        pt._prepare_for_training()
+        # Debug Hook should be disabled
+        assert pt.debugger_hook_config is False
+
+
 def test_mxnet_with_rules_and_debugger_hook_config(
     sagemaker_session,
     mxnet_training_latest_version,
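The integration test above exercises real estimators end to end. For illustration only, the same truth table could be covered by a hypothetical parametrized unit test (not part of this commit) that reuses the should_disable_debugger() sketch shown after the estimator.py diff:

import pytest


@pytest.mark.parametrize(
    "instance_count, distribution, expected",
    [
        (1, None, False),  # single instance, no distribution: debugger stays on
        (2, None, True),   # multi-instance checkpointing: debugger disabled
        (1, {"smdistributed": {"dataparallel": {"enabled": True}}}, True),
    ],
)
def test_debugger_gate(instance_count, distribution, expected):
    result = should_disable_debugger(
        checkpoint_s3_uri="s3://bucket/checkpoints",
        checkpoint_local_path="/opt/ml/checkpoints",
        debugger_hook_config=object(),
        instance_count=instance_count,
        distribution=distribution,
    )
    assert result is expected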
