aws · chuyang-deng · Aug 28, 2019 · Aug 23, 2019 · Aug 23, 2019 · Aug 27, 2019
@@ -71,8 +71,6 @@
 NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
 NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
 
-EFS_TEST_ENABLED_REGION = ["us-west-2"]
-
 logging.getLogger("boto3").setLevel(logging.INFO)
 logging.getLogger("botocore").setLevel(logging.INFO)
 

@@ -14,6 +14,7 @@
 
 import collections
 import logging
+from operator import itemgetter
 import os
 from os import path
 import stat
@@ -27,13 +28,12 @@
 from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx
 
 VPC_NAME = "sagemaker-efs-fsx-vpc"
+ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2"
 EFS_CREATION_TOKEN = str(uuid.uuid4())
 PREFIX = "ec2_fs_key_"
 KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8])
 ROLE_NAME = "SageMakerRole"
-REGION = "us-west-2"
 EC2_INSTANCE_TYPE = "t2.micro"
-AMI_ID = "ami-082b5a644766e0e6f"
 MIN_COUNT = 1
 MAX_COUNT = 1
 
@@ -69,12 +69,13 @@ def set_up_efs_fsx(sagemaker_session):
     _check_or_create_key_pair(sagemaker_session)
     _check_or_create_iam_profile_and_attach_role(sagemaker_session)
     subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
-        sagemaker_session, REGION, VPC_NAME
+        sagemaker_session, VPC_NAME
     )
 
+    ami_id = _ami_id_for_region(sagemaker_session)
     ec2_instance = _create_ec2_instance(
         sagemaker_session,
-        AMI_ID,
+        ami_id,
         EC2_INSTANCE_TYPE,
         KEY_NAME,
         MIN_COUNT,
@@ -100,16 +101,34 @@ def set_up_efs_fsx(sagemaker_session):
         mount_efs_target_id,
     )
 
+    region = sagemaker_session.boto_region_name
     try:
         connected_instance = _connect_ec2_instance(ec2_instance)
-        _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id)
+        _upload_data_and_mount_fs(
+            connected_instance, file_system_efs_id, file_system_fsx_id, region
+        )
     except Exception:
         tear_down(sagemaker_session, fs_resources)
         raise
 
     return fs_resources
 
 
+def _ami_id_for_region(sagemaker_session):
+    """Look up the most recently created AMI matching ``ALINUX_AMI_NAME_FILTER``.
+
+    The lookup goes through an EC2 client built from ``sagemaker_session.boto_session``,
+    so the result is whatever that client's configured region returns.
+
+    Args:
+        sagemaker_session: Session object exposing ``boto_session.client("ec2")``.
+
+    Returns:
+        str: The ``ImageId`` of the matching image with the greatest ``CreationDate``.
+
+    Raises:
+        Exception: If no available image matches the search criteria.
+    """
+    ec2_client = sagemaker_session.boto_session.client("ec2")
+    filters = [
+        {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]},
+        {"Name": "state", "Values": ["available"]},
+    ]
+    # Restrict the search to Amazon-owned images. Without an owner constraint, any public
+    # AMI whose name matches the wildcard pattern is a candidate, and because the newest
+    # one wins, a look-alike third-party image could end up booting the test instance.
+    response = ec2_client.describe_images(Owners=["amazon"], Filters=filters)
+    images = response["Images"]
+
+    if not images:
+        raise Exception("AMI was not found based on current search criteria: {}".format(filters))
+
+    # NOTE(review): CreationDate is compared as returned by the API (presumably an
+    # ISO-8601 string, so lexicographic order is chronological) -- confirm.
+    return max(images, key=itemgetter("CreationDate"))["ImageId"]
+
+
 def _connect_ec2_instance(ec2_instance):
     public_ip_address = ec2_instance.public_ip_address
     connected_instance = Connection(
@@ -118,7 +137,7 @@ def _connect_ec2_instance(ec2_instance):
     return connected_instance
 
 
-def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id):
+def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id, region):
     connected_instance.put(FS_MOUNT_SCRIPT, ".")
     connected_instance.run("mkdir temp_tf; mkdir temp_one_p", in_stream=False)
     for dir_name, subdir_list, file_list in os.walk(MNIST_LOCAL_DATA):
@@ -127,7 +146,7 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste
             connected_instance.put(local_file, "temp_tf/")
     connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/")
     connected_instance.run(
-        "sudo sh fs_mount_setup.sh {} {}".format(file_system_efs_id, file_system_fsx_id),
+        "sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region),
         in_stream=False,
     )
 
@@ -168,7 +187,7 @@ def _check_or_create_efs(sagemaker_session):
 
 def _create_efs_mount(sagemaker_session, file_system_id):
     subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
-        sagemaker_session, REGION, VPC_NAME
+        sagemaker_session, VPC_NAME
     )
     efs_client = sagemaker_session.boto_session.client("efs")
     mount_response = efs_client.create_mount_target(
@@ -188,7 +207,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
 def _check_or_create_fsx(sagemaker_session):
     fsx_client = sagemaker_session.boto_session.client("fsx")
     subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
-        sagemaker_session, REGION, VPC_NAME
+        sagemaker_session, VPC_NAME
     )
     create_response = fsx_client.create_file_system(
         FileSystemType="LUSTRE",

@@ -14,7 +14,6 @@
 
 import pytest
 
-import tests.integ
 from sagemaker import KMeans
 from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
 from sagemaker.parameter import IntegerParameter, CategoricalParameter
@@ -25,7 +24,6 @@
 from tests.integ.s3_utils import assert_s3_files_exist
 from tests.integ.timeout import timeout
 
-TRAIN_INSTANCE_TYPE = "ml.c4.xlarge"
 TRAIN_INSTANCE_COUNT = 1
 OBJECTIVE_METRIC_NAME = "test:msd"
 EFS_DIR_PATH = "/one_p_mnist"
@@ -46,19 +44,15 @@ def efs_fsx_setup(sagemaker_session):
         tear_down(sagemaker_session, fs_resources)
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
+def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         subnets = [efs_fsx_setup.subnet_id]
         security_group_ids = efs_fsx_setup.security_group_ids
         role = efs_fsx_setup.role_name
         kmeans = KMeans(
             role=role,
             train_instance_count=TRAIN_INSTANCE_COUNT,
-            train_instance_type=TRAIN_INSTANCE_TYPE,
+            train_instance_type=cpu_instance_type,
             k=K,
             sagemaker_session=sagemaker_session,
             subnets=subnets,
@@ -80,19 +74,15 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
         assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
+def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         subnets = [efs_fsx_setup.subnet_id]
         security_group_ids = efs_fsx_setup.security_group_ids
         role = efs_fsx_setup.role_name
         kmeans = KMeans(
             role=role,
             train_instance_count=TRAIN_INSTANCE_COUNT,
-            train_instance_type=TRAIN_INSTANCE_TYPE,
+            train_instance_type=cpu_instance_type,
             k=K,
             sagemaker_session=sagemaker_session,
             subnets=subnets,
@@ -114,18 +104,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
         assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
+def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
     role = efs_fsx_setup.role_name
     kmeans = KMeans(
         role=role,
         train_instance_count=TRAIN_INSTANCE_COUNT,
-        train_instance_type=TRAIN_INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         k=K,
         sagemaker_session=sagemaker_session,
         subnets=subnets,
@@ -174,18 +160,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
         assert best_training_job
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session):
+def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
     role = efs_fsx_setup.role_name
     kmeans = KMeans(
         role=role,
         train_instance_count=TRAIN_INSTANCE_COUNT,
-        train_instance_type=TRAIN_INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         k=K,
         sagemaker_session=sagemaker_session,
         subnets=subnets,

@@ -17,7 +17,6 @@
 
 import pytest
 
-import tests.integ
 from sagemaker.inputs import FileSystemInput
 from sagemaker.parameter import IntegerParameter
 from sagemaker.tensorflow import TensorFlow
@@ -32,7 +31,6 @@
 MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist")
 SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py")
 TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler")
-INSTANCE_TYPE = "ml.c4.xlarge"
 EFS_DIR_PATH = "/tensorflow"
 FSX_DIR_PATH = "/fsx/tensorflow"
 MAX_JOBS = 2
@@ -49,11 +47,7 @@ def efs_fsx_setup(sagemaker_session):
         tear_down(sagemaker_session, fs_resources)
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_mnist_efs(efs_fsx_setup, sagemaker_session):
+def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup.role_name
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
@@ -62,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
         entry_point=SCRIPT,
         role=role,
         train_instance_count=1,
-        train_instance_type=INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
         script_mode=True,
         framework_version=TensorFlow.LATEST_VERSION,
@@ -85,11 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
     )
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
+def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup.role_name
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
@@ -98,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
         entry_point=SCRIPT,
         role=role,
         train_instance_count=1,
-        train_instance_type=INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
         script_mode=True,
         framework_version=TensorFlow.LATEST_VERSION,
@@ -121,11 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
     )
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
+def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup.role_name
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
@@ -134,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
         entry_point=SCRIPT,
         role=role,
         train_instance_count=1,
-        train_instance_type=INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         script_mode=True,
         sagemaker_session=sagemaker_session,
         py_version=PY_VERSION,
@@ -169,11 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
     assert best_training_job
 
 
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
-    reason="EFS integration tests need to be fixed before running in all regions.",
-)
-def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
+def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
     role = efs_fsx_setup.role_name
     subnets = [efs_fsx_setup.subnet_id]
     security_group_ids = efs_fsx_setup.security_group_ids
@@ -182,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
         entry_point=SCRIPT,
         role=role,
         train_instance_count=1,
-        train_instance_type=INSTANCE_TYPE,
+        train_instance_type=cpu_instance_type,
         script_mode=True,
         sagemaker_session=sagemaker_session,
         py_version=PY_VERSION,

@@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id):
     return desc["RouteTables"][0]["RouteTableId"]
 
 
-def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME):
+def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
     # use lock to prevent race condition when tests are running concurrently
     with lock.lock(LOCK_PATH):
         ec2_client = sagemaker_session.boto_session.client("ec2")
@@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA
                 _security_group_ids_by_vpc_id(sagemaker_session, vpc_id),
             )
         else:
-            return _create_vpc_with_name_efs_fsx(ec2_client, region, name)
+            return _create_vpc_with_name_efs_fsx(ec2_client, name)
 
 
-def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
-    vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(
-        ec2_client, region, name
-    )
+def _create_vpc_with_name_efs_fsx(ec2_client, name):
+    vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
     ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id)
 
     ig = ec2_client.create_internet_gateway()
@@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
     return [subnet_id_a], [security_group_id]
 
 
-def _create_vpc_resources(ec2_client, region, name):
+def _create_vpc_resources(ec2_client, name):
     vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
     print("created vpc: {}".format(vpc_id))
 

@@ -16,18 +16,19 @@
 # Mounting EFS and FSx for Lustre file systems for integration Tests
 FILE_SYSTEM_EFS_ID=$1
 FILE_SYSTEM_FSX_ID=$2
+REGION=$3
 
 echo "Mounting EFS File Systems"
-sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2
+sudo yum install -y amazon-efs-utils
 sudo mkdir efs
 sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs
 sudo mkdir efs/tensorflow
 sudo mkdir efs/one_p_mnist
 
 echo "Mounting FSx for Lustre File System"
-sudo amazon-linux-extras install -y lustre2.10
+sudo yum install -y lustre-client
 sudo mkdir -p /mnt/fsx
-sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx
+sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx
 sudo mkdir /mnt/fsx/tensorflow
 sudo mkdir /mnt/fsx/one_p_mnist