Skip to content

Commit f233dc0

Browse files
caxiaohuchuyang-deng
authored and committed
fix: change AMI ids in tests to be dynamic based on regions (#1004)
* fix: change Amazon AMI ids to be dynamic based on regions using searching Amazon Linux AMI
1 parent af6f943 commit f233dc0

File tree

6 files changed

+53
-73
lines changed

6 files changed

+53
-73
lines changed

tests/integ/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,6 @@
7171
NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
7272
NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
7373

74-
EFS_TEST_ENABLED_REGION = ["us-west-2"]
75-
7674
logging.getLogger("boto3").setLevel(logging.INFO)
7775
logging.getLogger("botocore").setLevel(logging.INFO)
7876

tests/integ/file_system_input_utils.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import collections
1616
import logging
17+
from operator import itemgetter
1718
import os
1819
from os import path
1920
import stat
@@ -27,13 +28,12 @@
2728
from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx
2829

2930
VPC_NAME = "sagemaker-efs-fsx-vpc"
31+
ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2"
3032
EFS_CREATION_TOKEN = str(uuid.uuid4())
3133
PREFIX = "ec2_fs_key_"
3234
KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8])
3335
ROLE_NAME = "SageMakerRole"
34-
REGION = "us-west-2"
3536
EC2_INSTANCE_TYPE = "t2.micro"
36-
AMI_ID = "ami-082b5a644766e0e6f"
3737
MIN_COUNT = 1
3838
MAX_COUNT = 1
3939

@@ -69,12 +69,13 @@ def set_up_efs_fsx(sagemaker_session):
6969
_check_or_create_key_pair(sagemaker_session)
7070
_check_or_create_iam_profile_and_attach_role(sagemaker_session)
7171
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
72-
sagemaker_session, REGION, VPC_NAME
72+
sagemaker_session, VPC_NAME
7373
)
7474

75+
ami_id = _ami_id_for_region(sagemaker_session)
7576
ec2_instance = _create_ec2_instance(
7677
sagemaker_session,
77-
AMI_ID,
78+
ami_id,
7879
EC2_INSTANCE_TYPE,
7980
KEY_NAME,
8081
MIN_COUNT,
@@ -100,16 +101,34 @@ def set_up_efs_fsx(sagemaker_session):
100101
mount_efs_target_id,
101102
)
102103

104+
region = sagemaker_session.boto_region_name
103105
try:
104106
connected_instance = _connect_ec2_instance(ec2_instance)
105-
_upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id)
107+
_upload_data_and_mount_fs(
108+
connected_instance, file_system_efs_id, file_system_fsx_id, region
109+
)
106110
except Exception:
107111
tear_down(sagemaker_session, fs_resources)
108112
raise
109113

110114
return fs_resources
111115

112116

117+
def _ami_id_for_region(sagemaker_session):
    """Return the ID of the most recently created Amazon Linux AMI that is available.

    The lookup goes through an EC2 client built from ``sagemaker_session.boto_session``,
    so the result is whatever that client's configured region offers.

    Args:
        sagemaker_session: Session object exposing a ``boto_session`` attribute whose
            ``client("ec2")`` call returns an EC2 client.

    Returns:
        str: The ``ImageId`` of the matching image with the greatest ``CreationDate``.

    Raises:
        Exception: If no image matches the search criteria.
    """
    ec2_client = sagemaker_session.boto_session.client("ec2")
    filters = [
        {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]},
        {"Name": "state", "Values": ["available"]},
    ]

    # Restrict the search to Amazon-owned images. The name filter alone also matches
    # public AMIs published by any other account, and since the newest match wins, a
    # look-alike image could otherwise be selected and launched.
    response = ec2_client.describe_images(Owners=["amazon"], Filters=filters)
    images = response["Images"]

    if not images:
        raise Exception("AMI was not found based on current search criteria: {}".format(filters))

    # CreationDate is compared as returned by EC2 (presumably an ISO 8601 string, so
    # lexicographic order matches chronological order). max() keeps the first image on
    # ties, the same choice a stable descending sort would make.
    return max(images, key=itemgetter("CreationDate"))["ImageId"]
130+
131+
113132
def _connect_ec2_instance(ec2_instance):
114133
public_ip_address = ec2_instance.public_ip_address
115134
connected_instance = Connection(
@@ -118,7 +137,7 @@ def _connect_ec2_instance(ec2_instance):
118137
return connected_instance
119138

120139

121-
def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id):
140+
def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id, region):
122141
connected_instance.put(FS_MOUNT_SCRIPT, ".")
123142
connected_instance.run("mkdir temp_tf; mkdir temp_one_p", in_stream=False)
124143
for dir_name, subdir_list, file_list in os.walk(MNIST_LOCAL_DATA):
@@ -127,7 +146,7 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste
127146
connected_instance.put(local_file, "temp_tf/")
128147
connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/")
129148
connected_instance.run(
130-
"sudo sh fs_mount_setup.sh {} {}".format(file_system_efs_id, file_system_fsx_id),
149+
"sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region),
131150
in_stream=False,
132151
)
133152

@@ -168,7 +187,7 @@ def _check_or_create_efs(sagemaker_session):
168187

169188
def _create_efs_mount(sagemaker_session, file_system_id):
170189
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
171-
sagemaker_session, REGION, VPC_NAME
190+
sagemaker_session, VPC_NAME
172191
)
173192
efs_client = sagemaker_session.boto_session.client("efs")
174193
mount_response = efs_client.create_mount_target(
@@ -188,7 +207,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
188207
def _check_or_create_fsx(sagemaker_session):
189208
fsx_client = sagemaker_session.boto_session.client("fsx")
190209
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
191-
sagemaker_session, REGION, VPC_NAME
210+
sagemaker_session, VPC_NAME
192211
)
193212
create_response = fsx_client.create_file_system(
194213
FileSystemType="LUSTRE",

tests/integ/test_kmeans_efs_fsx.py

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import pytest
1616

17-
import tests.integ
1817
from sagemaker import KMeans
1918
from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
2019
from sagemaker.parameter import IntegerParameter, CategoricalParameter
@@ -25,7 +24,6 @@
2524
from tests.integ.s3_utils import assert_s3_files_exist
2625
from tests.integ.timeout import timeout
2726

28-
TRAIN_INSTANCE_TYPE = "ml.c4.xlarge"
2927
TRAIN_INSTANCE_COUNT = 1
3028
OBJECTIVE_METRIC_NAME = "test:msd"
3129
EFS_DIR_PATH = "/one_p_mnist"
@@ -46,19 +44,15 @@ def efs_fsx_setup(sagemaker_session):
4644
tear_down(sagemaker_session, fs_resources)
4745

4846

49-
@pytest.mark.skipif(
50-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
51-
reason="EFS integration tests need to be fixed before running in all regions.",
52-
)
53-
def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
47+
def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
5448
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
5549
subnets = [efs_fsx_setup.subnet_id]
5650
security_group_ids = efs_fsx_setup.security_group_ids
5751
role = efs_fsx_setup.role_name
5852
kmeans = KMeans(
5953
role=role,
6054
train_instance_count=TRAIN_INSTANCE_COUNT,
61-
train_instance_type=TRAIN_INSTANCE_TYPE,
55+
train_instance_type=cpu_instance_type,
6256
k=K,
6357
sagemaker_session=sagemaker_session,
6458
subnets=subnets,
@@ -80,19 +74,15 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
8074
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
8175

8276

83-
@pytest.mark.skipif(
84-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
85-
reason="EFS integration tests need to be fixed before running in all regions.",
86-
)
87-
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
77+
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
8878
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
8979
subnets = [efs_fsx_setup.subnet_id]
9080
security_group_ids = efs_fsx_setup.security_group_ids
9181
role = efs_fsx_setup.role_name
9282
kmeans = KMeans(
9383
role=role,
9484
train_instance_count=TRAIN_INSTANCE_COUNT,
95-
train_instance_type=TRAIN_INSTANCE_TYPE,
85+
train_instance_type=cpu_instance_type,
9686
k=K,
9787
sagemaker_session=sagemaker_session,
9888
subnets=subnets,
@@ -114,18 +104,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
114104
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
115105

116106

117-
@pytest.mark.skipif(
118-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
119-
reason="EFS integration tests need to be fixed before running in all regions.",
120-
)
121-
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
107+
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
122108
subnets = [efs_fsx_setup.subnet_id]
123109
security_group_ids = efs_fsx_setup.security_group_ids
124110
role = efs_fsx_setup.role_name
125111
kmeans = KMeans(
126112
role=role,
127113
train_instance_count=TRAIN_INSTANCE_COUNT,
128-
train_instance_type=TRAIN_INSTANCE_TYPE,
114+
train_instance_type=cpu_instance_type,
129115
k=K,
130116
sagemaker_session=sagemaker_session,
131117
subnets=subnets,
@@ -174,18 +160,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
174160
assert best_training_job
175161

176162

177-
@pytest.mark.skipif(
178-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
179-
reason="EFS integration tests need to be fixed before running in all regions.",
180-
)
181-
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session):
163+
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
182164
subnets = [efs_fsx_setup.subnet_id]
183165
security_group_ids = efs_fsx_setup.security_group_ids
184166
role = efs_fsx_setup.role_name
185167
kmeans = KMeans(
186168
role=role,
187169
train_instance_count=TRAIN_INSTANCE_COUNT,
188-
train_instance_type=TRAIN_INSTANCE_TYPE,
170+
train_instance_type=cpu_instance_type,
189171
k=K,
190172
sagemaker_session=sagemaker_session,
191173
subnets=subnets,

tests/integ/test_tf_efs_fsx.py

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
import pytest
1919

20-
import tests.integ
2120
from sagemaker.inputs import FileSystemInput
2221
from sagemaker.parameter import IntegerParameter
2322
from sagemaker.tensorflow import TensorFlow
@@ -32,7 +31,6 @@
3231
MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist")
3332
SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py")
3433
TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler")
35-
INSTANCE_TYPE = "ml.c4.xlarge"
3634
EFS_DIR_PATH = "/tensorflow"
3735
FSX_DIR_PATH = "/fsx/tensorflow"
3836
MAX_JOBS = 2
@@ -49,11 +47,7 @@ def efs_fsx_setup(sagemaker_session):
4947
tear_down(sagemaker_session, fs_resources)
5048

5149

52-
@pytest.mark.skipif(
53-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
54-
reason="EFS integration tests need to be fixed before running in all regions.",
55-
)
56-
def test_mnist_efs(efs_fsx_setup, sagemaker_session):
50+
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
5751
role = efs_fsx_setup.role_name
5852
subnets = [efs_fsx_setup.subnet_id]
5953
security_group_ids = efs_fsx_setup.security_group_ids
@@ -62,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
6256
entry_point=SCRIPT,
6357
role=role,
6458
train_instance_count=1,
65-
train_instance_type=INSTANCE_TYPE,
59+
train_instance_type=cpu_instance_type,
6660
sagemaker_session=sagemaker_session,
6761
script_mode=True,
6862
framework_version=TensorFlow.LATEST_VERSION,
@@ -85,11 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
8579
)
8680

8781

88-
@pytest.mark.skipif(
89-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
90-
reason="EFS integration tests need to be fixed before running in all regions.",
91-
)
92-
def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
82+
def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
9383
role = efs_fsx_setup.role_name
9484
subnets = [efs_fsx_setup.subnet_id]
9585
security_group_ids = efs_fsx_setup.security_group_ids
@@ -98,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
9888
entry_point=SCRIPT,
9989
role=role,
10090
train_instance_count=1,
101-
train_instance_type=INSTANCE_TYPE,
91+
train_instance_type=cpu_instance_type,
10292
sagemaker_session=sagemaker_session,
10393
script_mode=True,
10494
framework_version=TensorFlow.LATEST_VERSION,
@@ -121,11 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
121111
)
122112

123113

124-
@pytest.mark.skipif(
125-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
126-
reason="EFS integration tests need to be fixed before running in all regions.",
127-
)
128-
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
114+
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
129115
role = efs_fsx_setup.role_name
130116
subnets = [efs_fsx_setup.subnet_id]
131117
security_group_ids = efs_fsx_setup.security_group_ids
@@ -134,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
134120
entry_point=SCRIPT,
135121
role=role,
136122
train_instance_count=1,
137-
train_instance_type=INSTANCE_TYPE,
123+
train_instance_type=cpu_instance_type,
138124
script_mode=True,
139125
sagemaker_session=sagemaker_session,
140126
py_version=PY_VERSION,
@@ -169,11 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
169155
assert best_training_job
170156

171157

172-
@pytest.mark.skipif(
173-
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
174-
reason="EFS integration tests need to be fixed before running in all regions.",
175-
)
176-
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
158+
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
177159
role = efs_fsx_setup.role_name
178160
subnets = [efs_fsx_setup.subnet_id]
179161
security_group_ids = efs_fsx_setup.security_group_ids
@@ -182,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
182164
entry_point=SCRIPT,
183165
role=role,
184166
train_instance_count=1,
185-
train_instance_type=INSTANCE_TYPE,
167+
train_instance_type=cpu_instance_type,
186168
script_mode=True,
187169
sagemaker_session=sagemaker_session,
188170
py_version=PY_VERSION,

tests/integ/vpc_test_utils.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id):
6262
return desc["RouteTables"][0]["RouteTableId"]
6363

6464

65-
def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME):
65+
def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
6666
# use lock to prevent race condition when tests are running concurrently
6767
with lock.lock(LOCK_PATH):
6868
ec2_client = sagemaker_session.boto_session.client("ec2")
@@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA
7474
_security_group_ids_by_vpc_id(sagemaker_session, vpc_id),
7575
)
7676
else:
77-
return _create_vpc_with_name_efs_fsx(ec2_client, region, name)
77+
return _create_vpc_with_name_efs_fsx(ec2_client, name)
7878

7979

80-
def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
81-
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(
82-
ec2_client, region, name
83-
)
80+
def _create_vpc_with_name_efs_fsx(ec2_client, name):
81+
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
8482
ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id)
8583

8684
ig = ec2_client.create_internet_gateway()
@@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
121119
return [subnet_id_a], [security_group_id]
122120

123121

124-
def _create_vpc_resources(ec2_client, region, name):
122+
def _create_vpc_resources(ec2_client, name):
125123
vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
126124
print("created vpc: {}".format(vpc_id))
127125

tests/scripts/fs_mount_setup.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,19 @@
1616
# Mounting EFS and FSx for Lustre file systems for integration Tests
1717
FILE_SYSTEM_EFS_ID=$1
1818
FILE_SYSTEM_FSX_ID=$2
19+
REGION=$3
1920

2021
echo "Mounting EFS File Systems"
21-
sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2
22+
sudo yum install -y amazon-efs-utils
2223
sudo mkdir efs
2324
sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs
2425
sudo mkdir efs/tensorflow
2526
sudo mkdir efs/one_p_mnist
2627

2728
echo "Mounting FSx for Lustre File System"
28-
sudo amazon-linux-extras install -y lustre2.10
29+
sudo yum install -y lustre-client
2930
sudo mkdir -p /mnt/fsx
30-
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx
31+
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx
3132
sudo mkdir /mnt/fsx/tensorflow
3233
sudo mkdir /mnt/fsx/one_p_mnist
3334

0 commit comments

Comments
 (0)