Skip to content

Commit 7397934

Browse files
mufaddal-rohawala authored and jerrypeng7773 committed
fix: integs fallback from p3 to p2 instance (aws#3168)
1 parent 47e1453 commit 7397934

File tree

6 files changed: +47 −10 lines changed

tests/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@
4646
"ca-central-1", # it has p3, but not enough
4747
"eu-central-1", # it has p3, but not enough
4848
"eu-north-1",
49+
"eu-west-1", # it has p3, but not enough
4950
"eu-west-2", # it has p3, but not enough
5051
"eu-west-3",
5152
"eu-south-1",

tests/integ/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@
7474
"ca-central-1", # it has p3, but not enough
7575
"eu-central-1", # it has p3, but not enough
7676
"eu-north-1",
77+
"eu-west-1", # it has p3, but not enough
7778
"eu-west-2", # it has p3, but not enough
7879
"eu-west-3",
7980
"eu-south-1",

tests/integ/test_horovod.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222

2323
import sagemaker.utils
2424
import tests.integ as integ
25+
from tests.integ.utils import gpu_list, retry_with_instance_list
2526
from sagemaker.tensorflow import TensorFlow
2627
from tests.integ import timeout
2728

@@ -51,18 +52,19 @@ def test_hvd_cpu(
5152
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
5253
reason="no ml.p2 or ml.p3 instances in this region",
5354
)
55+
@retry_with_instance_list(gpu_list(integ.test_region()))
5456
def test_hvd_gpu(
5557
sagemaker_session,
5658
tensorflow_training_latest_version,
5759
tensorflow_training_latest_py_version,
58-
gpu_instance_type,
5960
tmpdir,
61+
**kwargs,
6062
):
6163
_create_and_fit_estimator(
6264
sagemaker_session,
6365
tensorflow_training_latest_version,
6466
tensorflow_training_latest_py_version,
65-
gpu_instance_type,
67+
kwargs["instance_type"],
6668
tmpdir,
6769
)
6870

tests/integ/test_horovod_mx.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
import tests.integ as integ
2525
from sagemaker.mxnet import MXNet
2626
from tests.integ import timeout
27+
from tests.integ.utils import gpu_list, retry_with_instance_list
2728

2829
horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")
2930

@@ -51,18 +52,19 @@ def test_hvd_cpu(
5152
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
5253
reason="no ml.p2 or ml.p3 instances in this region",
5354
)
55+
@retry_with_instance_list(gpu_list(integ.test_region()))
5456
def test_hvd_gpu(
5557
mxnet_training_latest_version,
5658
mxnet_training_latest_py_version,
5759
sagemaker_session,
58-
gpu_instance_type,
5960
tmpdir,
61+
**kwargs,
6062
):
6163
_create_and_fit_estimator(
6264
mxnet_training_latest_version,
6365
mxnet_training_latest_py_version,
6466
sagemaker_session,
65-
gpu_instance_type,
67+
kwargs["instance_type"],
6668
tmpdir,
6769
)
6870

tests/integ/test_huggingface.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,12 +69,13 @@ def test_framework_processing_job_with_deps(
6969
and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
7070
reason="no ml.p2 or ml.p3 instances in this region",
7171
)
72+
@retry_with_instance_list(gpu_list(integ.test_region()))
7273
def test_huggingface_training(
7374
sagemaker_session,
74-
gpu_instance_type,
7575
huggingface_training_latest_version,
7676
huggingface_training_pytorch_latest_version,
7777
huggingface_pytorch_latest_training_py_version,
78+
**kwargs,
7879
):
7980
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
8081
data_path = os.path.join(DATA_DIR, "huggingface")
@@ -86,7 +87,7 @@ def test_huggingface_training(
8687
transformers_version=huggingface_training_latest_version,
8788
pytorch_version=huggingface_training_pytorch_latest_version,
8889
instance_count=1,
89-
instance_type=gpu_instance_type,
90+
instance_type=kwargs["instance_type"],
9091
hyperparameters={
9192
"model_name_or_path": "distilbert-base-cased",
9293
"task_name": "wnli",

tests/integ/test_tf.py

Lines changed: 34 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -182,21 +182,51 @@ def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_v
182182

183183

184184
@pytest.mark.release
185-
def test_mnist_distributed(
185+
def test_mnist_distributed_cpu(
186186
sagemaker_session,
187-
instance_type,
187+
cpu_instance_type,
188188
tensorflow_training_latest_version,
189189
tensorflow_training_latest_py_version,
190190
):
191+
_create_and_fit_estimator(
192+
sagemaker_session,
193+
tensorflow_training_latest_version,
194+
tensorflow_training_latest_py_version,
195+
cpu_instance_type,
196+
)
197+
198+
199+
@pytest.mark.release
200+
@pytest.mark.skipif(
201+
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
202+
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
203+
reason="no ml.p2 or ml.p3 instances in this region",
204+
)
205+
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
206+
def test_mnist_distributed_gpu(
207+
sagemaker_session,
208+
tensorflow_training_latest_version,
209+
tensorflow_training_latest_py_version,
210+
**kwargs,
211+
):
212+
_create_and_fit_estimator(
213+
sagemaker_session,
214+
tensorflow_training_latest_version,
215+
tensorflow_training_latest_py_version,
216+
kwargs["instance_type"],
217+
)
218+
219+
220+
def _create_and_fit_estimator(sagemaker_session, tf_version, py_version, instance_type):
191221
estimator = TensorFlow(
192222
entry_point=SCRIPT,
193223
source_dir=MNIST_RESOURCE_PATH,
194224
role=ROLE,
195225
instance_count=2,
196226
instance_type=instance_type,
197227
sagemaker_session=sagemaker_session,
198-
framework_version=tensorflow_training_latest_version,
199-
py_version=tensorflow_training_latest_py_version,
228+
framework_version=tf_version,
229+
py_version=py_version,
200230
distribution=PARAMETER_SERVER_DISTRIBUTION,
201231
disable_profiler=True,
202232
)

0 commit comments

Comments
 (0)