Skip to content

breaking: force image_uri to be passed for legacy TF images #1539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 49 additions & 62 deletions src/sagemaker/tensorflow/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,27 +29,18 @@

logger = logging.getLogger("sagemaker")

# TODO: consider creating a function for generating this command before removing this constant
_SCRIPT_MODE_TENSORBOARD_WARNING = (
"Tensorboard is not supported with script mode. You can run the following "
"command: tensorboard --logdir %s --host localhost --port 6006 This can be "
"run from anywhere with access to the S3 URI used as the logdir."
)


class TensorFlow(Framework):
"""Handle end-to-end training and deployment of user-provided TensorFlow code."""

__framework_name__ = "tensorflow"
_SCRIPT_MODE_REPO_NAME = "tensorflow-scriptmode"
_ECR_REPO_NAME = "tensorflow-scriptmode"

LATEST_VERSION = defaults.LATEST_VERSION

_LATEST_1X_VERSION = "1.15.2"

_HIGHEST_LEGACY_MODE_ONLY_VERSION = version.Version("1.10.0")
_LOWEST_SCRIPT_MODE_ONLY_VERSION = version.Version("1.13.1")

_HIGHEST_PYTHON_2_VERSION = version.Version("2.1.0")

def __init__(
Expand All @@ -59,7 +50,6 @@ def __init__(
model_dir=None,
image_name=None,
distributions=None,
script_mode=True,
**kwargs
):
"""Initialize a ``TensorFlow`` estimator.
Expand All @@ -82,6 +72,8 @@ def __init__(
* *Local Mode with local sources (file:// instead of s3://)* - \
``/opt/ml/shared/model``

To disable having ``model_dir`` passed to your training script,
set ``model_dir=False``.
image_name (str): If specified, the estimator will use this image for training and
hosting, instead of selecting the appropriate SageMaker official image based on
framework_version and py_version. It can be an ECR url or dockerhub image and tag.
Expand Down Expand Up @@ -114,8 +106,6 @@ def __init__(
}
}

script_mode (bool): Whether or not to use the Script Mode TensorFlow images
(default: True).
**kwargs: Additional kwargs passed to the Framework constructor.

.. tip::
Expand Down Expand Up @@ -154,7 +144,6 @@ def __init__(
self.model_dir = model_dir
self.distributions = distributions or {}

self._script_mode_enabled = script_mode
self._validate_args(py_version=py_version, framework_version=self.framework_version)

def _validate_args(self, py_version, framework_version):
Expand All @@ -171,30 +160,29 @@ def _validate_args(self, py_version, framework_version):
)
raise AttributeError(msg)

if (not self._script_mode_enabled) and self._only_script_mode_supported():
logger.warning(
"Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead."
if self._only_legacy_mode_supported() and self.image_name is None:
legacy_image_uri = fw.create_image_uri(
self.sagemaker_session.boto_region_name,
"tensorflow",
self.train_instance_type,
self.framework_version,
self.py_version,
)
self._script_mode_enabled = True

if self._only_legacy_mode_supported():
# TODO: add link to docs to explain how to use legacy mode with v2
logger.warning(
"TF %s supports only legacy mode. If you were using any legacy mode parameters "
msg = (
"TF {} supports only legacy mode. Please supply the image URI directly with "
"'image_name={}' and set 'model_dir=False'. If you are using any legacy parameters "
"(training_steps, evaluation_steps, checkpoint_path, requirements_file), "
"make sure to pass them directly as hyperparameters instead.",
self.framework_version,
)
self._script_mode_enabled = False
"make sure to pass them directly as hyperparameters instead."
).format(self.framework_version, legacy_image_uri)

raise ValueError(msg)

def _only_legacy_mode_supported(self):
"""Placeholder docstring"""
return version.Version(self.framework_version) <= self._HIGHEST_LEGACY_MODE_ONLY_VERSION

def _only_script_mode_supported(self):
"""Placeholder docstring"""
return version.Version(self.framework_version) >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION

def _only_python_3_supported(self):
"""Placeholder docstring"""
return version.Version(self.framework_version) > self._HIGHEST_PYTHON_2_VERSION
Expand All @@ -214,10 +202,6 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
job_details, model_channel_name
)

model_dir = init_params["hyperparameters"].pop("model_dir", None)
if model_dir is not None:
init_params["model_dir"] = model_dir

image_name = init_params.pop("image")
framework, py_version, tag, script_mode = fw.framework_name_from_image(image_name)
if not framework:
Expand All @@ -226,8 +210,11 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
init_params["image_name"] = image_name
return init_params

if script_mode is None:
init_params["script_mode"] = False
model_dir = init_params["hyperparameters"].pop("model_dir", None)
if model_dir:
init_params["model_dir"] = model_dir
elif script_mode is None:
init_params["model_dir"] = False

init_params["py_version"] = py_version

Expand All @@ -239,6 +226,10 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
"1.4" if tag == "1.0" else fw.framework_version_from_tag(tag)
)

# Legacy images are required to be passed in explicitly.
if not script_mode:
init_params["image_name"] = image_name

training_job_name = init_params["base_job_name"]
if framework != cls.__framework_name__:
raise ValueError(
Expand Down Expand Up @@ -309,27 +300,26 @@ def hyperparameters(self):
hyperparameters = super(TensorFlow, self).hyperparameters()
additional_hyperparameters = {}

if self._script_mode_enabled:
mpi_enabled = False

if "parameter_server" in self.distributions:
ps_enabled = self.distributions["parameter_server"].get("enabled", False)
additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled
if "parameter_server" in self.distributions:
ps_enabled = self.distributions["parameter_server"].get("enabled", False)
additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled

if "mpi" in self.distributions:
mpi_dict = self.distributions["mpi"]
mpi_enabled = mpi_dict.get("enabled", False)
additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled
mpi_enabled = False
if "mpi" in self.distributions:
mpi_dict = self.distributions["mpi"]
mpi_enabled = mpi_dict.get("enabled", False)
additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled

if mpi_dict.get("processes_per_host"):
additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get(
"processes_per_host"
)

additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get(
"custom_mpi_options", ""
if mpi_dict.get("processes_per_host"):
additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get(
"processes_per_host"
)

additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get(
"custom_mpi_options", ""
)

if self.model_dir is not False:
self.model_dir = self.model_dir or self._default_s3_path("model", mpi=mpi_enabled)
additional_hyperparameters["model_dir"] = self.model_dir

Expand Down Expand Up @@ -375,16 +365,13 @@ def train_image(self):
if self.image_name:
return self.image_name

if self._script_mode_enabled:
return fw.create_image_uri(
self.sagemaker_session.boto_region_name,
self._SCRIPT_MODE_REPO_NAME,
self.train_instance_type,
self.framework_version,
self.py_version,
)

return super(TensorFlow, self).train_image()
return fw.create_image_uri(
self.sagemaker_session.boto_region_name,
self._ECR_REPO_NAME,
self.train_instance_type,
self.framework_version,
self.py_version,
)

def transformer(
self,
Expand Down
15 changes: 15 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,21 @@ def xgboost_version(request):
"1.9.0",
"1.10",
"1.10.0",
"1.11",
"1.11.0",
"1.12",
"1.12.0",
"1.13",
"1.14",
"1.14.0",
"1.15",
"1.15.0",
"1.15.2",
"2.0",
"2.0.0",
"2.0.1",
"2.1",
"2.1.0",
],
)
def tf_version(request):
Expand Down
1 change: 0 additions & 1 deletion tests/integ/test_airflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,6 @@ def test_tf_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_inst
train_instance_count=SINGLE_INSTANCE_COUNT,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
py_version=PYTHON_VERSION,
metric_definitions=[
Expand Down
2 changes: 0 additions & 2 deletions tests/integ/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
train_instance_type="local",
sagemaker_session=sagemaker_local_session,
py_version=integ.PYTHON_VERSION,
script_mode=True,
output_path=output_path,
framework_version="1.12",
distributions={"mpi": {"enabled": True, "processes_per_host": processes}},
Expand Down Expand Up @@ -106,7 +105,6 @@ def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
py_version=integ.PYTHON_VERSION,
script_mode=True,
framework_version="1.12",
distributions={"mpi": {"enabled": True}},
)
Expand Down
5 changes: 0 additions & 5 deletions tests/integ/test_tf_script_mode.py → tests/integ/test_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ def test_mnist_with_checkpoint_config(
train_instance_count=1,
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=tf_full_version,
py_version=py_version,
metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
Expand Down Expand Up @@ -104,7 +103,6 @@ def test_server_side_encryption(sagemaker_session, tf_full_version, py_version):
train_instance_count=1,
train_instance_type="ml.c5.xlarge",
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=tf_full_version,
py_version=py_version,
code_location=output_path,
Expand Down Expand Up @@ -141,7 +139,6 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, py
train_instance_type=instance_type,
sagemaker_session=sagemaker_session,
py_version=py_version,
script_mode=True,
framework_version=tf_full_version,
distributions=PARAMETER_SERVER_DISTRIBUTION,
)
Expand All @@ -166,7 +163,6 @@ def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, py_v
train_instance_type="ml.c5.4xlarge",
py_version=tests.integ.PYTHON_VERSION,
sagemaker_session=sagemaker_session,
script_mode=True,
# testing py-sdk functionality, no need to run against all TF versions
framework_version=TensorFlow.LATEST_VERSION,
tags=TAGS,
Expand Down Expand Up @@ -209,7 +205,6 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_full_ve
train_instance_type=instance_type,
py_version=py_version,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=tf_full_version,
tags=TAGS,
)
Expand Down
8 changes: 2 additions & 6 deletions tests/integ/test_tf_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
train_instance_count=1,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
py_version=PY_VERSION,
subnets=subnets,
Expand Down Expand Up @@ -105,7 +104,6 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
train_instance_count=1,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
py_version=PY_VERSION,
subnets=subnets,
Expand All @@ -130,7 +128,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
def test_tuning_tf_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup["role_name"]
subnets = [efs_fsx_setup["subnet_id"]]
security_group_ids = efs_fsx_setup["security_group_ids"]
Expand All @@ -140,7 +138,6 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
role=role,
train_instance_count=1,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
framework_version=TensorFlow.LATEST_VERSION,
Expand Down Expand Up @@ -178,7 +175,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
def test_tuning_tf_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup["role_name"]
subnets = [efs_fsx_setup["subnet_id"]]
security_group_ids = efs_fsx_setup["security_group_ids"]
Expand All @@ -188,7 +185,6 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_inst
role=role,
train_instance_count=1,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
framework_version=TensorFlow.LATEST_VERSION,
Expand Down
1 change: 0 additions & 1 deletion tests/integ/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,6 @@ def test_transform_tf_kms_network_isolation(sagemaker_session, cpu_instance_type
train_instance_count=1,
train_instance_type=cpu_instance_type,
framework_version=TensorFlow.LATEST_VERSION,
script_mode=True,
py_version=PYTHON_VERSION,
sagemaker_session=sagemaker_session,
)
Expand Down
1 change: 0 additions & 1 deletion tests/integ/test_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,6 @@ def test_tuning_tf_script_mode(sagemaker_session, cpu_instance_type, tf_full_ver
role="SageMakerRole",
train_instance_count=1,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PYTHON_VERSION,
framework_version=tf_full_version,
Expand Down
Loading