Skip to content

Commit 116f059

Browse files
committed
Merge branch 'master' of github.com:aws/sagemaker-python-sdk into add_tf_2.7_2.8
2 parents 56c39f9 + d52b8a9 commit 116f059

File tree

7 files changed

+154
-48
lines changed

7 files changed

+154
-48
lines changed

.readthedocs.yml renamed to .readthedocs.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,19 @@
44

55
version: 2
66

7+
build:
8+
os: ubuntu-20.04
9+
tools:
10+
python: "3.9"
11+
12+
713
python:
8-
version: 3.9
914
install:
1015
- method: pip
1116
path: .
1217
- requirements: doc/requirements.txt
1318

19+
1420
sphinx:
1521
configuration: doc/conf.py
1622
fail_on_warning: true # http://www.sphinx-doc.org/en/master/man/sphinx-build.html#id6

src/sagemaker/fw_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
"2.6",
7575
"2.6.0",
7676
"2.6.2",
77+
"2.6.3",
7778
"2.8",
7879
"2.8.0",
7980
],

src/sagemaker/image_uri_config/tensorflow.json

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@
279279
"2.3": "2.3.2",
280280
"2.4": "2.4.3",
281281
"2.5": "2.5.1",
282-
"2.6": "2.6.0",
282+
"2.6": "2.6.3",
283283
"2.8": "2.8.0"
284284
},
285285
"versions": {
@@ -1345,6 +1345,36 @@
13451345
},
13461346
"repository": "tensorflow-inference"
13471347
},
1348+
"2.6.3": {
1349+
"registries": {
1350+
"af-south-1": "626614931356",
1351+
"ap-east-1": "871362719292",
1352+
"ap-northeast-1": "763104351884",
1353+
"ap-northeast-2": "763104351884",
1354+
"ap-northeast-3": "364406365360",
1355+
"ap-south-1": "763104351884",
1356+
"ap-southeast-1": "763104351884",
1357+
"ap-southeast-2": "763104351884",
1358+
"ca-central-1": "763104351884",
1359+
"cn-north-1": "727897471807",
1360+
"cn-northwest-1": "727897471807",
1361+
"eu-central-1": "763104351884",
1362+
"eu-north-1": "763104351884",
1363+
"eu-south-1": "692866216735",
1364+
"eu-west-1": "763104351884",
1365+
"eu-west-2": "763104351884",
1366+
"eu-west-3": "763104351884",
1367+
"me-south-1": "217643126080",
1368+
"sa-east-1": "763104351884",
1369+
"us-east-1": "763104351884",
1370+
"us-east-2": "763104351884",
1371+
"us-gov-west-1": "442386744353",
1372+
"us-iso-east-1": "886529160074",
1373+
"us-west-1": "763104351884",
1374+
"us-west-2": "763104351884"
1375+
},
1376+
"repository": "tensorflow-inference"
1377+
},
13481378
"2.8.0": {
13491379
"registries": {
13501380
"af-south-1": "626614931356",
@@ -1401,8 +1431,7 @@
14011431
"2.3": "2.3.2",
14021432
"2.4": "2.4.3",
14031433
"2.5": "2.5.1",
1404-
"2.6": "2.6.2",
1405-
"2.7": "2.7.1",
1434+
"2.6": "2.6.3",
14061435
"2.8": "2.8.0"
14071436
},
14081437
"versions": {
@@ -2663,9 +2692,42 @@
26632692
},
26642693
"repository": "tensorflow-training"
26652694
},
2695+
"2.6.3": {
2696+
"py_versions": [
2697+
"py38"
2698+
],
2699+
"registries": {
2700+
"af-south-1": "626614931356",
2701+
"ap-east-1": "871362719292",
2702+
"ap-northeast-1": "763104351884",
2703+
"ap-northeast-2": "763104351884",
2704+
"ap-northeast-3": "364406365360",
2705+
"ap-south-1": "763104351884",
2706+
"ap-southeast-1": "763104351884",
2707+
"ap-southeast-2": "763104351884",
2708+
"ca-central-1": "763104351884",
2709+
"cn-north-1": "727897471807",
2710+
"cn-northwest-1": "727897471807",
2711+
"eu-central-1": "763104351884",
2712+
"eu-north-1": "763104351884",
2713+
"eu-south-1": "692866216735",
2714+
"eu-west-1": "763104351884",
2715+
"eu-west-2": "763104351884",
2716+
"eu-west-3": "763104351884",
2717+
"me-south-1": "217643126080",
2718+
"sa-east-1": "763104351884",
2719+
"us-east-1": "763104351884",
2720+
"us-east-2": "763104351884",
2721+
"us-gov-west-1": "442386744353",
2722+
"us-iso-east-1": "886529160074",
2723+
"us-west-1": "763104351884",
2724+
"us-west-2": "763104351884"
2725+
},
2726+
"repository": "tensorflow-training"
2727+
},
26662728
"2.8.0": {
26672729
"py_versions": [
2668-
"py39"
2730+
"py39"
26692731
],
26702732
"registries": {
26712733
"af-south-1": "626614931356",

tests/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,15 @@ def gpu_instance_type(sagemaker_session, request):
409409
return "ml.p3.2xlarge"
410410

411411

412+
@pytest.fixture(scope="session")
413+
def gpu_instance_type_list(sagemaker_session, request):
414+
region = sagemaker_session.boto_session.region_name
415+
if region in NO_P3_REGIONS:
416+
return ["ml.p2.xlarge"]
417+
else:
418+
return ["ml.p3.2xlarge", "ml.p2.xlarge"]
419+
420+
412421
@pytest.fixture(scope="session")
413422
def inf_instance_type(sagemaker_session, request):
414423
return "ml.inf1.xlarge"

tests/integ/test_huggingface.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@
1515
import os
1616

1717
import pytest
18+
import logging
1819

1920
from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
2021
from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor
2122
from sagemaker.utils import unique_name_from_base
2223
from tests import integ
2324
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2425
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
26+
from sagemaker.exceptions import UnexpectedStatusException
2527

2628
ROLE = "SageMakerRole"
2729

@@ -34,32 +36,41 @@
3436
)
3537
def test_framework_processing_job_with_deps(
3638
sagemaker_session,
37-
gpu_instance_type,
39+
gpu_instance_type_list,
3840
huggingface_training_latest_version,
3941
huggingface_training_pytorch_latest_version,
4042
huggingface_pytorch_latest_training_py_version,
4143
):
42-
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
43-
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
44-
entry_point = "main_script.py"
45-
46-
processor = HuggingFaceProcessor(
47-
transformers_version=huggingface_training_latest_version,
48-
pytorch_version=huggingface_training_pytorch_latest_version,
49-
py_version=huggingface_pytorch_latest_training_py_version,
50-
role=ROLE,
51-
instance_count=1,
52-
instance_type=gpu_instance_type,
53-
sagemaker_session=sagemaker_session,
54-
base_job_name="test-huggingface",
55-
)
56-
57-
processor.run(
58-
code=entry_point,
59-
source_dir=code_path,
60-
inputs=[],
61-
wait=True,
62-
)
44+
for i_type in gpu_instance_type_list:
45+
logging.info("Using the instance type: {}".format(i_type))
46+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
47+
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
48+
entry_point = "main_script.py"
49+
50+
processor = HuggingFaceProcessor(
51+
transformers_version=huggingface_training_latest_version,
52+
pytorch_version=huggingface_training_pytorch_latest_version,
53+
py_version=huggingface_pytorch_latest_training_py_version,
54+
role=ROLE,
55+
instance_count=1,
56+
instance_type=i_type,
57+
sagemaker_session=sagemaker_session,
58+
base_job_name="test-huggingface",
59+
)
60+
try:
61+
processor.run(
62+
code=entry_point,
63+
source_dir=code_path,
64+
inputs=[],
65+
wait=True,
66+
)
67+
except UnexpectedStatusException as e:
68+
if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]:
69+
logging.warning("Failure using instance type: {}. {}".format(i_type, str(e)))
70+
continue
71+
else:
72+
raise
73+
break
6374

6475

6576
@pytest.mark.release

tests/integ/test_tf.py

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import numpy as np
1616
import os
1717
import time
18+
import logging
1819

1920
import pytest
2021

@@ -25,6 +26,8 @@
2526
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, kms_utils, timeout
2627
from tests.integ.retry import retries
2728
from tests.integ.s3_utils import assert_s3_file_patterns_exist
29+
from sagemaker.exceptions import UnexpectedStatusException
30+
2831

2932
ROLE = "SageMakerRole"
3033

@@ -42,30 +45,39 @@
4245
@pytest.mark.release
4346
def test_framework_processing_job_with_deps(
4447
sagemaker_session,
45-
instance_type,
48+
gpu_instance_type_list,
4649
tensorflow_training_latest_version,
4750
tensorflow_training_latest_py_version,
4851
):
49-
with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
50-
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
51-
entry_point = "main_script.py"
52-
53-
processor = TensorFlowProcessor(
54-
framework_version=tensorflow_training_latest_version,
55-
py_version=tensorflow_training_latest_py_version,
56-
role=ROLE,
57-
instance_count=1,
58-
instance_type=instance_type,
59-
sagemaker_session=sagemaker_session,
60-
base_job_name="test-tensorflow",
61-
)
62-
63-
processor.run(
64-
code=entry_point,
65-
source_dir=code_path,
66-
inputs=[],
67-
wait=True,
68-
)
52+
for i_type in gpu_instance_type_list:
53+
logging.info("Using the instance type: {}".format(i_type))
54+
with timeout.timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
55+
code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
56+
entry_point = "main_script.py"
57+
58+
processor = TensorFlowProcessor(
59+
framework_version=tensorflow_training_latest_version,
60+
py_version=tensorflow_training_latest_py_version,
61+
role=ROLE,
62+
instance_count=1,
63+
instance_type=i_type,
64+
sagemaker_session=sagemaker_session,
65+
base_job_name="test-tensorflow",
66+
)
67+
try:
68+
processor.run(
69+
code=entry_point,
70+
source_dir=code_path,
71+
inputs=[],
72+
wait=True,
73+
)
74+
except UnexpectedStatusException as e:
75+
if "CapacityError" in str(e) and i_type != gpu_instance_type_list[-1]:
76+
logging.warning("Failure using instance type: {}. {}".format(i_type, str(e)))
77+
continue
78+
else:
79+
raise
80+
break
6981

7082

7183
def test_mnist_with_checkpoint_config(

tests/unit/test_fw_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,7 @@ def test_validate_smdataparallel_args_not_raises():
685685
("ml.p3.16xlarge", "tensorflow", "2.5", "py37", smdataparallel_enabled),
686686
("ml.p3.16xlarge", "tensorflow", "2.6.0", "py38", smdataparallel_enabled),
687687
("ml.p3.16xlarge", "tensorflow", "2.6.2", "py38", smdataparallel_enabled),
688+
("ml.p3.16xlarge", "tensorflow", "2.6.3", "py38", smdataparallel_enabled),
688689
("ml.p3.16xlarge", "tensorflow", "2.6", "py38", smdataparallel_enabled),
689690
("ml.p3.16xlarge", "tensorflow", "2.8.0", "py39", smdataparallel_enabled),
690691
("ml.p3.16xlarge", "tensorflow", "2.8", "py39", smdataparallel_enabled),
@@ -698,10 +699,14 @@ def test_validate_smdataparallel_args_not_raises():
698699
("ml.p3.16xlarge", "pytorch", "1.9.1", "py38", smdataparallel_enabled),
699700
("ml.p3.16xlarge", "pytorch", "1.9", "py38", smdataparallel_enabled),
700701
("ml.p3.16xlarge", "pytorch", "1.10", "py38", smdataparallel_enabled),
702+
("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
703+
("ml.p3.16xlarge", "tensorflow", "2.4.1", "py37", smdataparallel_enabled_custom_mpi),
701704
("ml.p3.16xlarge", "tensorflow", "2.4.3", "py3", smdataparallel_enabled_custom_mpi),
702705
("ml.p3.16xlarge", "tensorflow", "2.4.3", "py37", smdataparallel_enabled_custom_mpi),
703706
("ml.p3.16xlarge", "tensorflow", "2.5.1", "py37", smdataparallel_enabled_custom_mpi),
707+
("ml.p3.16xlarge", "tensorflow", "2.6.0", "py38", smdataparallel_enabled_custom_mpi),
704708
("ml.p3.16xlarge", "tensorflow", "2.6.2", "py38", smdataparallel_enabled_custom_mpi),
709+
("ml.p3.16xlarge", "tensorflow", "2.6.3", "py38", smdataparallel_enabled_custom_mpi),
705710
("ml.p3.16xlarge", "tensorflow", "2.8.0", "py39", smdataparallel_enabled_custom_mpi),
706711
("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled_custom_mpi),
707712
("ml.p3.16xlarge", "pytorch", "1.9.1", "py38", smdataparallel_enabled_custom_mpi),

0 commit comments

Comments
 (0)