aws · laurenyu · Jun 2, 2020 · Jun 1, 2020 · Jun 2, 2020 · Jun 2, 2020
diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py
@@ -29,27 +29,18 @@
 
 logger = logging.getLogger("sagemaker")
 
-# TODO: consider creating a function for generating this command before removing this constant
-_SCRIPT_MODE_TENSORBOARD_WARNING = (
-    "Tensorboard is not supported with script mode. You can run the following "
-    "command: tensorboard --logdir %s --host localhost --port 6006 This can be "
-    "run from anywhere with access to the S3 URI used as the logdir."
-)
-
 
 class TensorFlow(Framework):
     """Handle end-to-end training and deployment of user-provided TensorFlow code."""
 
     __framework_name__ = "tensorflow"
-    _SCRIPT_MODE_REPO_NAME = "tensorflow-scriptmode"
+    _ECR_REPO_NAME = "tensorflow-scriptmode"
 
     LATEST_VERSION = defaults.LATEST_VERSION
 
     _LATEST_1X_VERSION = "1.15.2"
 
     _HIGHEST_LEGACY_MODE_ONLY_VERSION = version.Version("1.10.0")
-    _LOWEST_SCRIPT_MODE_ONLY_VERSION = version.Version("1.13.1")
-
     _HIGHEST_PYTHON_2_VERSION = version.Version("2.1.0")
 
     def __init__(
@@ -59,7 +50,6 @@ def __init__(
         model_dir=None,
         image_name=None,
         distributions=None,
-        script_mode=True,
         **kwargs
     ):
         """Initialize a ``TensorFlow`` estimator.
@@ -82,6 +72,8 @@ def __init__(
                 * *Local Mode with local sources (file:// instead of s3://)* - \
                     ``/opt/ml/shared/model``
 
+                To disable having ``model_dir`` passed to your training script,
+                set ``model_dir=False``.
             image_name (str): If specified, the estimator will use this image for training and
                 hosting, instead of selecting the appropriate SageMaker official image based on
                 framework_version and py_version. It can be an ECR url or dockerhub image and tag.
@@ -114,8 +106,6 @@ def __init__(
                         }
                     }
 
-            script_mode (bool): Whether or not to use the Script Mode TensorFlow images
-                (default: True).
             **kwargs: Additional kwargs passed to the Framework constructor.
 
         .. tip::
@@ -154,7 +144,6 @@ def __init__(
         self.model_dir = model_dir
         self.distributions = distributions or {}
 
-        self._script_mode_enabled = script_mode
         self._validate_args(py_version=py_version, framework_version=self.framework_version)
 
     def _validate_args(self, py_version, framework_version):
@@ -171,30 +160,29 @@ def _validate_args(self, py_version, framework_version):
             )
             raise AttributeError(msg)
 
-        if (not self._script_mode_enabled) and self._only_script_mode_supported():
-            logger.warning(
-                "Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead."
+        if self._only_legacy_mode_supported() and self.image_name is None:
+            legacy_image_uri = fw.create_image_uri(
+                self.sagemaker_session.boto_region_name,
+                "tensorflow",
+                self.train_instance_type,
+                self.framework_version,
+                self.py_version,
             )
-            self._script_mode_enabled = True
 
-        if self._only_legacy_mode_supported():
             # TODO: add link to docs to explain how to use legacy mode with v2
-            logger.warning(
-                "TF %s supports only legacy mode. If you were using any legacy mode parameters "
+            msg = (
+                "TF {} supports only legacy mode. Please supply the image URI directly with "
+                "'image_name={}' and set 'model_dir=False'. If you are using any legacy parameters "
                 "(training_steps, evaluation_steps, checkpoint_path, requirements_file), "
-                "make sure to pass them directly as hyperparameters instead.",
-                self.framework_version,
-            )
-            self._script_mode_enabled = False
+                "make sure to pass them directly as hyperparameters instead."
+            ).format(self.framework_version, legacy_image_uri)
+
+            raise ValueError(msg)
 
     def _only_legacy_mode_supported(self):
         """Placeholder docstring"""
         return version.Version(self.framework_version) <= self._HIGHEST_LEGACY_MODE_ONLY_VERSION
 
-    def _only_script_mode_supported(self):
-        """Placeholder docstring"""
-        return version.Version(self.framework_version) >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION
-
     def _only_python_3_supported(self):
         """Placeholder docstring"""
         return version.Version(self.framework_version) > self._HIGHEST_PYTHON_2_VERSION
@@ -214,10 +202,6 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
             job_details, model_channel_name
         )
 
-        model_dir = init_params["hyperparameters"].pop("model_dir", None)
-        if model_dir is not None:
-            init_params["model_dir"] = model_dir
-
         image_name = init_params.pop("image")
         framework, py_version, tag, script_mode = fw.framework_name_from_image(image_name)
         if not framework:
@@ -226,8 +210,11 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
             init_params["image_name"] = image_name
             return init_params
 
-        if script_mode is None:
-            init_params["script_mode"] = False
+        model_dir = init_params["hyperparameters"].pop("model_dir", None)
+        if model_dir:
+            init_params["model_dir"] = model_dir
+        elif script_mode is None:
+            init_params["model_dir"] = False
 
         init_params["py_version"] = py_version
 
@@ -239,6 +226,10 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
             "1.4" if tag == "1.0" else fw.framework_version_from_tag(tag)
         )
 
+        # Legacy images are required to be passed in explicitly.
+        if not script_mode:
+            init_params["image_name"] = image_name
+
         training_job_name = init_params["base_job_name"]
         if framework != cls.__framework_name__:
             raise ValueError(
@@ -309,27 +300,26 @@ def hyperparameters(self):
         hyperparameters = super(TensorFlow, self).hyperparameters()
         additional_hyperparameters = {}
 
-        if self._script_mode_enabled:
-            mpi_enabled = False
-
-            if "parameter_server" in self.distributions:
-                ps_enabled = self.distributions["parameter_server"].get("enabled", False)
-                additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled
+        if "parameter_server" in self.distributions:
+            ps_enabled = self.distributions["parameter_server"].get("enabled", False)
+            additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled
 
-            if "mpi" in self.distributions:
-                mpi_dict = self.distributions["mpi"]
-                mpi_enabled = mpi_dict.get("enabled", False)
-                additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled
+        mpi_enabled = False
+        if "mpi" in self.distributions:
+            mpi_dict = self.distributions["mpi"]
+            mpi_enabled = mpi_dict.get("enabled", False)
+            additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled
 
-                if mpi_dict.get("processes_per_host"):
-                    additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get(
-                        "processes_per_host"
-                    )
-
-                additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get(
-                    "custom_mpi_options", ""
+            if mpi_dict.get("processes_per_host"):
+                additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get(
+                    "processes_per_host"
                 )
 
+            additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get(
+                "custom_mpi_options", ""
+            )
+
+        if self.model_dir is not False:
             self.model_dir = self.model_dir or self._default_s3_path("model", mpi=mpi_enabled)
             additional_hyperparameters["model_dir"] = self.model_dir
 
@@ -375,16 +365,13 @@ def train_image(self):
         if self.image_name:
             return self.image_name
 
-        if self._script_mode_enabled:
-            return fw.create_image_uri(
-                self.sagemaker_session.boto_region_name,
-                self._SCRIPT_MODE_REPO_NAME,
-                self.train_instance_type,
-                self.framework_version,
-                self.py_version,
-            )
-
-        return super(TensorFlow, self).train_image()
+        return fw.create_image_uri(
+            self.sagemaker_session.boto_region_name,
+            self._ECR_REPO_NAME,
+            self.train_instance_type,
+            self.framework_version,
+            self.py_version,
+        )
 
     def transformer(
         self,

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -194,6 +194,21 @@ def xgboost_version(request):
         "1.9.0",
         "1.10",
         "1.10.0",
+        "1.11",
+        "1.11.0",
+        "1.12",
+        "1.12.0",
+        "1.13",
+        "1.14",
+        "1.14.0",
+        "1.15",
+        "1.15.0",
+        "1.15.2",
+        "2.0",
+        "2.0.0",
+        "2.0.1",
+        "2.1",
+        "2.1.0",
     ],
 )
 def tf_version(request):

diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py
@@ -561,7 +561,6 @@ def test_tf_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_inst
             train_instance_count=SINGLE_INSTANCE_COUNT,
             train_instance_type=cpu_instance_type,
             sagemaker_session=sagemaker_session,
-            script_mode=True,
             framework_version=TensorFlow.LATEST_VERSION,
             py_version=PYTHON_VERSION,
             metric_definitions=[

diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py
@@ -58,7 +58,6 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
         train_instance_type="local",
         sagemaker_session=sagemaker_local_session,
         py_version=integ.PYTHON_VERSION,
-        script_mode=True,
         output_path=output_path,
         framework_version="1.12",
         distributions={"mpi": {"enabled": True, "processes_per_host": processes}},
@@ -106,7 +105,6 @@ def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
         py_version=integ.PYTHON_VERSION,
-        script_mode=True,
         framework_version="1.12",
         distributions={"mpi": {"enabled": True}},
     )

diff --git a/tests/integ/test_tf_script_mode.py → tests/integ/test_tf.py b/tests/integ/test_tf_script_mode.py → tests/integ/test_tf.py
@@ -59,7 +59,6 @@ def test_mnist_with_checkpoint_config(
         train_instance_count=1,
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
-        script_mode=True,
         framework_version=tf_full_version,
         py_version=py_version,
         metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
@@ -104,7 +103,6 @@ def test_server_side_encryption(sagemaker_session, tf_full_version, py_version):
             train_instance_count=1,
             train_instance_type="ml.c5.xlarge",
             sagemaker_session=sagemaker_session,
-            script_mode=True,
             framework_version=tf_full_version,
             py_version=py_version,
             code_location=output_path,
@@ -141,7 +139,6 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, py
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
         py_version=py_version,
-        script_mode=True,
         framework_version=tf_full_version,
         distributions=PARAMETER_SERVER_DISTRIBUTION,
     )
@@ -166,7 +163,6 @@ def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, py_v
         train_instance_type="ml.c5.4xlarge",
         py_version=tests.integ.PYTHON_VERSION,
         sagemaker_session=sagemaker_session,
-        script_mode=True,
         # testing py-sdk functionality, no need to run against all TF versions
         framework_version=TensorFlow.LATEST_VERSION,
         tags=TAGS,
@@ -209,7 +205,6 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_full_ve
         train_instance_type=instance_type,
         py_version=py_version,
         sagemaker_session=sagemaker_session,
-        script_mode=True,
         framework_version=tf_full_version,
         tags=TAGS,
     )

diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py
@@ -65,7 +65,6 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        script_mode=True,
         framework_version=TensorFlow.LATEST_VERSION,
         py_version=PY_VERSION,
         subnets=subnets,
@@ -105,7 +104,6 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
-        script_mode=True,
         framework_version=TensorFlow.LATEST_VERSION,
         py_version=PY_VERSION,
         subnets=subnets,
@@ -130,7 +128,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
-def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
+def test_tuning_tf_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
     security_group_ids = efs_fsx_setup["security_group_ids"]
@@ -140,7 +138,6 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
         role=role,
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
-        script_mode=True,
         sagemaker_session=sagemaker_session,
         py_version=PY_VERSION,
         framework_version=TensorFlow.LATEST_VERSION,
@@ -178,7 +175,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
     tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
     reason="EFS integration tests need to be fixed before running in all regions.",
 )
-def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
+def test_tuning_tf_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup["role_name"]
     subnets = [efs_fsx_setup["subnet_id"]]
     security_group_ids = efs_fsx_setup["security_group_ids"]
@@ -188,7 +185,6 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_inst
         role=role,
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
-        script_mode=True,
         sagemaker_session=sagemaker_session,
         py_version=PY_VERSION,
         framework_version=TensorFlow.LATEST_VERSION,

diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
@@ -352,7 +352,6 @@ def test_transform_tf_kms_network_isolation(sagemaker_session, cpu_instance_type
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         framework_version=TensorFlow.LATEST_VERSION,
-        script_mode=True,
         py_version=PYTHON_VERSION,
         sagemaker_session=sagemaker_session,
     )

diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
@@ -599,7 +599,6 @@ def test_tuning_tf_script_mode(sagemaker_session, cpu_instance_type, tf_full_ver
         role="SageMakerRole",
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
-        script_mode=True,
         sagemaker_session=sagemaker_session,
         py_version=PYTHON_VERSION,
         framework_version=tf_full_version,