aws · ahsan-z-khan · Apr 14, 2021 · Mar 29, 2021 · Mar 30, 2021 · Mar 30, 2021
@@ -1954,6 +1954,7 @@ class Framework(EstimatorBase):
     INSTANCE_TYPE = "sagemaker_instance_type"
     MPI_NUM_PROCESSES_PER_HOST = "sagemaker_mpi_num_of_processes_per_host"
     MPI_CUSTOM_MPI_OPTIONS = "sagemaker_mpi_custom_mpi_options"
+    SM_DDP_CUSTOM_MPI_OPTIONS = "sagemaker_distributed_dataparallel_custom_mpi_options"
     CONTAINER_CODE_CHANNEL_SOURCEDIR_PATH = "/opt/ml/input/data/code/sourcedir.tar.gz"
 
     def __init__(
@@ -2629,6 +2630,10 @@ def _distribution_configuration(self, distribution):
             smdataparallel_enabled = smdistributed.get("dataparallel", {}).get("enabled", False)
             distribution_config[self.LAUNCH_SM_DDP_ENV_NAME] = smdataparallel_enabled
             distribution_config[self.INSTANCE_TYPE] = self.instance_type
+            if smdataparallel_enabled:
+                distribution_config[self.SM_DDP_CUSTOM_MPI_OPTIONS] = smdistributed[
+                    "dataparallel"
+                ].get("custom_mpi_options", "")
 
         return distribution_config
 

@@ -47,7 +47,9 @@ def test_smdataparallel_pt_mnist(
         sagemaker_session=sagemaker_session,
         framework_version=pytorch_training_latest_version,
         py_version=pytorch_training_latest_py_version,
-        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
+        distribution={
+            "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "--verbose"}}
+        },
     )
 
     with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):

@@ -31,7 +31,6 @@
     integ.test_region() not in integ.DATA_PARALLEL_TESTING_REGIONS,
     reason="Only allow this test to run in IAD and CMH to limit usage of p3.16xlarge",
 )
-@pytest.mark.skip("Failing due to bad DLC image release. Disable temporarily.")
 def test_smdataparallel_tf_mnist(
     sagemaker_session,
     tensorflow_training_latest_version,
@@ -47,7 +46,9 @@ def test_smdataparallel_tf_mnist(
         sagemaker_session=sagemaker_session,
         framework_version=tensorflow_training_latest_version,
         py_version=tensorflow_training_latest_py_version,
-        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
+        distribution={
+            "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "--verbose"}}
+        },
     )
 
     with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):

@@ -121,7 +121,9 @@
 DISTRIBUTION_MPI_ENABLED = {
     "mpi": {"enabled": True, "custom_mpi_options": "options", "processes_per_host": 2}
 }
-DISTRIBUTION_SM_DDP_ENABLED = {"smdistributed": {"dataparallel": {"enabled": True}}}
+DISTRIBUTION_SM_DDP_ENABLED = {
+    "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}}
+}
 
 
 class DummyFramework(Framework):
@@ -3290,6 +3292,7 @@ def test_framework_distribution_configuration(sagemaker_session):
     actual_ddp = framework._distribution_configuration(distribution=DISTRIBUTION_SM_DDP_ENABLED)
     expected_ddp = {
         "sagemaker_distributed_dataparallel_enabled": True,
+        "sagemaker_distributed_dataparallel_custom_mpi_options": "options",
         "sagemaker_instance_type": INSTANCE_TYPE,
     }
     assert actual_ddp == expected_ddp

@@ -553,11 +553,15 @@ def test_validate_version_or_image_args_raises():
 
 def test_validate_smdistributed_not_raises():
     smdataparallel_enabled = {"smdistributed": {"dataparallel": {"enabled": True}}}
+    smdataparallel_enabled_custom_mpi = {
+        "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "--verbose"}}
+    }
     smdataparallel_disabled = {"smdistributed": {"dataparallel": {"enabled": False}}}
     instance_types = list(fw_utils.SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES)
 
     good_args = [
         (smdataparallel_enabled, "custom-container"),
+        (smdataparallel_enabled_custom_mpi, "custom-container"),
         (smdataparallel_disabled, "custom-container"),
     ]
     frameworks = ["tensorflow", "pytorch"]
@@ -576,17 +580,17 @@ def test_validate_smdistributed_not_raises():
 
 def test_validate_smdistributed_raises():
     bad_args = [
-        {"smdistributed": {"dataparallel": {"enabled": True}}},
         {"smdistributed": "dummy"},
         {"smdistributed": {"dummy"}},
         {"smdistributed": {"dummy": "val"}},
         {"smdistributed": {"dummy": {"enabled": True}}},
     ]
+    instance_types = list(fw_utils.SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES)
     frameworks = ["tensorflow", "pytorch"]
-    for framework, distribution in product(frameworks, bad_args):
+    for framework, distribution, instance_type in product(frameworks, bad_args, instance_types):
         with pytest.raises(ValueError):
             fw_utils.validate_smdistributed(
-                instance_type=None,
+                instance_type=instance_type,
                 framework_name=framework,
                 framework_version=None,
                 py_version=None,
@@ -624,6 +628,9 @@ def test_validate_smdataparallel_args_raises():
 
 def test_validate_smdataparallel_args_not_raises():
     smdataparallel_enabled = {"smdistributed": {"dataparallel": {"enabled": True}}}
+    smdataparallel_enabled_custom_mpi = {
+        "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "--verbose"}}
+    }
     smdataparallel_disabled = {"smdistributed": {"dataparallel": {"enabled": False}}}
 
     # Cases {PT|TF2}
@@ -644,6 +651,8 @@ def test_validate_smdataparallel_args_not_raises():
         ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8.1", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8", "py3", smdataparallel_enabled),
+        ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
+        ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled_custom_mpi),
     ]
     for instance_type, framework_name, framework_version, py_version, distribution in good_args:
         fw_utils._validate_smdataparallel_args(