Skip to content

feature: Support TF2.12 SageMaker DLC #3776

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion src/sagemaker/image_uri_config/tensorflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -2021,7 +2021,8 @@
"2.8": "2.8.0",
"2.9": "2.9.2",
"2.10": "2.10.1",
"2.11": "2.11.0"
"2.11": "2.11.0",
"2.12": "2.12.0"
},
"versions": {
"1.10.0": {
Expand Down Expand Up @@ -3755,6 +3756,37 @@
"us-west-2": "763104351884"
},
"repository": "tensorflow-training"
},
"2.12.0": {
"py_versions": [
"py310"
],
"registries": {
"af-south-1": "626614931356",
"ap-east-1": "871362719292",
"ap-northeast-1": "763104351884",
"ap-northeast-2": "763104351884",
"ap-northeast-3": "364406365360",
"ap-south-1": "763104351884",
"ap-southeast-1": "763104351884",
"ap-southeast-2": "763104351884",
"ap-southeast-3": "907027046896",
"ap-southeast-4": "457447274322",
"ca-central-1": "763104351884",
"eu-central-1": "763104351884",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cn regions are to be added after the SM platform update, right?

"eu-north-1": "763104351884",
"eu-south-1": "692866216735",
"eu-west-1": "763104351884",
"eu-west-2": "763104351884",
"eu-west-3": "763104351884",
"me-south-1": "217643126080",
"sa-east-1": "763104351884",
"us-east-1": "763104351884",
"us-east-2": "763104351884",
"us-west-1": "763104351884",
"us-west-2": "763104351884"
},
"repository": "tensorflow-training"
}
}
}
Expand Down
19 changes: 9 additions & 10 deletions src/sagemaker/image_uris.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,16 +369,15 @@ def _config_for_framework_and_scope(framework, image_scope, accelerator_type=Non

def _validate_instance_deprecation(framework, instance_type, version):
    """Check if instance type is deprecated for a certain framework with a certain version.

    P2 instances are rejected for PyTorch 1.13 and above and for
    TensorFlow 2.12 and above. Every other combination passes silently.

    Args:
        framework: Framework name; only "pytorch" and "tensorflow" are checked.
        instance_type: Instance type whose family is looked up via
            ``_get_instance_type_family``.
        version: Framework version string, parsed with ``Version``.

    Raises:
        ValueError: If a P2 instance is requested for a framework version
            that no longer supports it.
    """
    # Only the P2 family is subject to deprecation; bail out early otherwise
    # so the version string is never parsed for unaffected instance types.
    if _get_instance_type_family(instance_type) != "p2":
        return

    if (framework == "pytorch" and Version(version) >= Version("1.13")) or (
        framework == "tensorflow" and Version(version) >= Version("2.12")
    ):
        # Adjacent string literals are concatenated verbatim, so each piece
        # must carry its own trailing separator.
        raise ValueError(
            "P2 instances have been deprecated for sagemaker jobs starting "
            "PyTorch 1.13 and TensorFlow 2.12. "
            "For information about supported instance types please refer to "
            "https://aws.amazon.com/sagemaker/pricing/"
        )


def _validate_for_suppported_frameworks_and_instance_type(framework, instance_type):
Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,9 @@ def _tf_py_version(tf_version, request):
return "py37"
if Version("2.6") <= version < Version("2.8"):
return "py38"
return "py39"
if Version("2.8") <= version < Version("2.12"):
return "py39"
return "py310"


@pytest.fixture(scope="module")
Expand Down
6 changes: 4 additions & 2 deletions tests/integ/test_training_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,10 @@ def test_tensorflow(
"""
Test the TensorFlow estimator
"""
if version.parse(tensorflow_training_latest_version) < version.parse("2.9"):
pytest.skip("Training Compiler only supports TF >= 2.9")
if version.parse(tensorflow_training_latest_version) >= version.parse("2.12") or version.parse(
tensorflow_training_latest_version
) < version.parse("2.9"):
pytest.skip("Training Compiler only supports TF >= 2.9 and < 2.12")
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
epochs = 10
batch = 256
Expand Down
14 changes: 13 additions & 1 deletion tests/unit/sagemaker/image_uris/test_dlc_frameworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from sagemaker import image_uris
from tests.unit.sagemaker.image_uris import expected_uris

import pytest

INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.p2.xlarge", "gpu"))
RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS = (("ml.c4.xlarge", "cpu"), ("ml.g4dn.xlarge", "gpu"))
REGION = "us-west-2"
Expand Down Expand Up @@ -72,7 +74,9 @@ def _test_image_uris(
}

TYPES_AND_PROCESSORS = INSTANCE_TYPES_AND_PROCESSORS
if framework == "pytorch" and Version(fw_version) >= Version("1.13"):
if (framework == "pytorch" and Version(fw_version) >= Version("1.13")) or (
framework == "tensorflow" and Version(fw_version) >= Version("2.12")
):
"""Handle P2 deprecation"""
TYPES_AND_PROCESSORS = RENEWED_PYTORCH_INSTANCE_TYPES_AND_PROCESSORS

Expand All @@ -83,6 +87,14 @@ def _test_image_uris(
assert expected == uri

for region in SAGEMAKER_ALTERNATE_REGION_ACCOUNTS.keys():
if (
scope == "training"
and framework == "tensorflow"
and Version(fw_version) == Version("2.12")
):
if region in ["cn-north-1", "cn-northwest-1", "us-iso-east-1", "us-isob-east-1"]:
pytest.skip(f"TF 2.12 SM DLC is not available in {region} region")

uri = image_uris.retrieve(region=region, instance_type="ml.c4.xlarge", **base_args)

expected = expected_fn(region=region, **expected_fn_args)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@

@pytest.fixture(scope="module", autouse=True)
def skip_if_incompatible(tensorflow_training_version, request):
    """Skip the module's tests when the TensorFlow version is unsupported.

    The fixture is module-scoped and autouse, so it applies without being
    requested explicitly. Tests run only when the version satisfies
    2.9 <= version < 2.12; anything outside that window is skipped.

    Args:
        tensorflow_training_version: TensorFlow version string under test.
        request: Pytest request object (not used by this fixture).
    """
    # Parse once and express the supported window as a single range check.
    tf_version = version.parse(tensorflow_training_version)
    if not (version.parse("2.9") <= tf_version < version.parse("2.12")):
        pytest.skip("Training Compiler only supports TF >= 2.9 and < 2.12")


@pytest.fixture(scope="module")
Expand Down