Skip to content

fix: change AMI ids in tests to be dynamic based on regions #1004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 28, 2019
Merged
2 changes: 0 additions & 2 deletions tests/integ/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@
NO_LDA_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]
NO_MARKET_PLACE_REGIONS = ["eu-west-3", "eu-north-1", "sa-east-1", "ap-east-1"]

EFS_TEST_ENABLED_REGION = ["us-west-2"]

logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("botocore").setLevel(logging.INFO)

Expand Down
37 changes: 28 additions & 9 deletions tests/integ/file_system_input_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import collections
import logging
from operator import itemgetter
import os
from os import path
import stat
Expand All @@ -27,13 +28,12 @@
from tests.integ.vpc_test_utils import check_or_create_vpc_resources_efs_fsx

VPC_NAME = "sagemaker-efs-fsx-vpc"
ALINUX_AMI_NAME_FILTER = "amzn-ami-hvm-????.??.?.????????-x86_64-gp2"
EFS_CREATION_TOKEN = str(uuid.uuid4())
PREFIX = "ec2_fs_key_"
KEY_NAME = PREFIX + str(uuid.uuid4().hex.upper()[0:8])
ROLE_NAME = "SageMakerRole"
REGION = "us-west-2"
EC2_INSTANCE_TYPE = "t2.micro"
AMI_ID = "ami-082b5a644766e0e6f"
MIN_COUNT = 1
MAX_COUNT = 1

Expand Down Expand Up @@ -69,12 +69,13 @@ def set_up_efs_fsx(sagemaker_session):
_check_or_create_key_pair(sagemaker_session)
_check_or_create_iam_profile_and_attach_role(sagemaker_session)
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)

ami_id = _ami_id_for_region(sagemaker_session)
ec2_instance = _create_ec2_instance(
sagemaker_session,
AMI_ID,
ami_id,
EC2_INSTANCE_TYPE,
KEY_NAME,
MIN_COUNT,
Expand All @@ -100,16 +101,34 @@ def set_up_efs_fsx(sagemaker_session):
mount_efs_target_id,
)

region = sagemaker_session.boto_region_name
try:
connected_instance = _connect_ec2_instance(ec2_instance)
_upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id)
_upload_data_and_mount_fs(
connected_instance, file_system_efs_id, file_system_fsx_id, region
)
except Exception:
tear_down(sagemaker_session, fs_resources)
raise

return fs_resources


def _ami_id_for_region(sagemaker_session):
    """Return the ID of the most recently created AMI matching ALINUX_AMI_NAME_FILTER.

    The lookup is performed with an EC2 client obtained from
    ``sagemaker_session.boto_session``, so the result depends on the region
    that session is configured for (presumably the region under test).

    Args:
        sagemaker_session: Object exposing ``boto_session.client("ec2")``.

    Returns:
        The ``ImageId`` of the available, Amazon-owned image with the greatest
        ``CreationDate`` among those whose name matches the filter.

    Raises:
        Exception: If no image matches the search criteria.
    """
    ec2_client = sagemaker_session.boto_session.client("ec2")
    filters = [
        {"Name": "name", "Values": [ALINUX_AMI_NAME_FILTER]},
        {"Name": "state", "Values": ["available"]},
    ]
    # Restrict the search to images owned by Amazon. Without an owner
    # restriction, any account can publish a public AMI whose name matches the
    # wildcard pattern and, by being newer, get selected and launched here.
    response = ec2_client.describe_images(Owners=["amazon"], Filters=filters)
    images = response["Images"]

    if not images:
        raise Exception("AMI was not found based on current search criteria: {}".format(filters))

    # Only the newest image is needed, so pick it directly instead of sorting
    # the whole list.
    newest_image = max(images, key=itemgetter("CreationDate"))
    return newest_image["ImageId"]


def _connect_ec2_instance(ec2_instance):
public_ip_address = ec2_instance.public_ip_address
connected_instance = Connection(
Expand All @@ -118,7 +137,7 @@ def _connect_ec2_instance(ec2_instance):
return connected_instance


def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id):
def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_system_fsx_id, region):
connected_instance.put(FS_MOUNT_SCRIPT, ".")
connected_instance.run("mkdir temp_tf; mkdir temp_one_p", in_stream=False)
for dir_name, subdir_list, file_list in os.walk(MNIST_LOCAL_DATA):
Expand All @@ -127,7 +146,7 @@ def _upload_data_and_mount_fs(connected_instance, file_system_efs_id, file_syste
connected_instance.put(local_file, "temp_tf/")
connected_instance.put(ONE_P_LOCAL_DATA, "temp_one_p/")
connected_instance.run(
"sudo sh fs_mount_setup.sh {} {}".format(file_system_efs_id, file_system_fsx_id),
"sudo sh fs_mount_setup.sh {} {} {}".format(file_system_efs_id, file_system_fsx_id, region),
in_stream=False,
)

Expand Down Expand Up @@ -168,7 +187,7 @@ def _check_or_create_efs(sagemaker_session):

def _create_efs_mount(sagemaker_session, file_system_id):
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)
efs_client = sagemaker_session.boto_session.client("efs")
mount_response = efs_client.create_mount_target(
Expand All @@ -188,7 +207,7 @@ def _create_efs_mount(sagemaker_session, file_system_id):
def _check_or_create_fsx(sagemaker_session):
fsx_client = sagemaker_session.boto_session.client("fsx")
subnet_ids, security_group_ids = check_or_create_vpc_resources_efs_fsx(
sagemaker_session, REGION, VPC_NAME
sagemaker_session, VPC_NAME
)
create_response = fsx_client.create_file_system(
FileSystemType="LUSTRE",
Expand Down
34 changes: 8 additions & 26 deletions tests/integ/test_kmeans_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import pytest

import tests.integ
from sagemaker import KMeans
from sagemaker.amazon.amazon_estimator import FileSystemRecordSet
from sagemaker.parameter import IntegerParameter, CategoricalParameter
Expand All @@ -25,7 +24,6 @@
from tests.integ.s3_utils import assert_s3_files_exist
from tests.integ.timeout import timeout

TRAIN_INSTANCE_TYPE = "ml.c4.xlarge"
TRAIN_INSTANCE_COUNT = 1
OBJECTIVE_METRIC_NAME = "test:msd"
EFS_DIR_PATH = "/one_p_mnist"
Expand All @@ -46,19 +44,15 @@ def efs_fsx_setup(sagemaker_session):
tear_down(sagemaker_session, fs_resources)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
def test_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand All @@ -80,19 +74,15 @@ def test_kmeans_efs(efs_fsx_setup, sagemaker_session):
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand All @@ -114,18 +104,14 @@ def test_kmeans_fsx(efs_fsx_setup, sagemaker_session):
assert_s3_files_exist(sagemaker_session, model_path, ["model.tar.gz"])


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand Down Expand Up @@ -174,18 +160,14 @@ def test_tuning_kmeans_efs(efs_fsx_setup, sagemaker_session):
assert best_training_job


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session):
def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
role = efs_fsx_setup.role_name
kmeans = KMeans(
role=role,
train_instance_count=TRAIN_INSTANCE_COUNT,
train_instance_type=TRAIN_INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
k=K,
sagemaker_session=sagemaker_session,
subnets=subnets,
Expand Down
34 changes: 8 additions & 26 deletions tests/integ/test_tf_efs_fsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

import pytest

import tests.integ
from sagemaker.inputs import FileSystemInput
from sagemaker.parameter import IntegerParameter
from sagemaker.tensorflow import TensorFlow
Expand All @@ -32,7 +31,6 @@
MNIST_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tensorflow_mnist")
SCRIPT = os.path.join(MNIST_RESOURCE_PATH, "mnist.py")
TFS_RESOURCE_PATH = os.path.join(RESOURCE_PATH, "tfs", "tfs-test-entrypoint-with-handler")
INSTANCE_TYPE = "ml.c4.xlarge"
EFS_DIR_PATH = "/tensorflow"
FSX_DIR_PATH = "/fsx/tensorflow"
MAX_JOBS = 2
Expand All @@ -49,11 +47,7 @@ def efs_fsx_setup(sagemaker_session):
tear_down(sagemaker_session, fs_resources)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_mnist_efs(efs_fsx_setup, sagemaker_session):
def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -62,7 +56,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
Expand All @@ -85,11 +79,7 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session):
)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -98,7 +88,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
Expand All @@ -121,11 +111,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session):
)


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -134,7 +120,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
Expand Down Expand Up @@ -169,11 +155,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session):
assert best_training_job


@pytest.mark.skipif(
tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION,
reason="EFS integration tests need to be fixed before running in all regions.",
)
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type):
role = efs_fsx_setup.role_name
subnets = [efs_fsx_setup.subnet_id]
security_group_ids = efs_fsx_setup.security_group_ids
Expand All @@ -182,7 +164,7 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session):
entry_point=SCRIPT,
role=role,
train_instance_count=1,
train_instance_type=INSTANCE_TYPE,
train_instance_type=cpu_instance_type,
script_mode=True,
sagemaker_session=sagemaker_session,
py_version=PY_VERSION,
Expand Down
12 changes: 5 additions & 7 deletions tests/integ/vpc_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _route_table_id(ec2_client, vpc_id):
return desc["RouteTables"][0]["RouteTableId"]


def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NAME):
def check_or_create_vpc_resources_efs_fsx(sagemaker_session, name=VPC_NAME):
# use lock to prevent race condition when tests are running concurrently
with lock.lock(LOCK_PATH):
ec2_client = sagemaker_session.boto_session.client("ec2")
Expand All @@ -74,13 +74,11 @@ def check_or_create_vpc_resources_efs_fsx(sagemaker_session, region, name=VPC_NA
_security_group_ids_by_vpc_id(sagemaker_session, vpc_id),
)
else:
return _create_vpc_with_name_efs_fsx(ec2_client, region, name)
return _create_vpc_with_name_efs_fsx(ec2_client, name)


def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(
ec2_client, region, name
)
def _create_vpc_with_name_efs_fsx(ec2_client, name):
vpc_id, [subnet_id_a, subnet_id_b], security_group_id = _create_vpc_resources(ec2_client, name)
ec2_client.modify_vpc_attribute(EnableDnsHostnames={"Value": True}, VpcId=vpc_id)

ig = ec2_client.create_internet_gateway()
Expand Down Expand Up @@ -121,7 +119,7 @@ def _create_vpc_with_name_efs_fsx(ec2_client, region, name):
return [subnet_id_a], [security_group_id]


def _create_vpc_resources(ec2_client, region, name):
def _create_vpc_resources(ec2_client, name):
vpc_id = ec2_client.create_vpc(CidrBlock="10.0.0.0/16")["Vpc"]["VpcId"]
print("created vpc: {}".format(vpc_id))

Expand Down
7 changes: 4 additions & 3 deletions tests/scripts/fs_mount_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,19 @@
# Mounting EFS and FSx for Lustre file systems for integration Tests
FILE_SYSTEM_EFS_ID=$1
FILE_SYSTEM_FSX_ID=$2
REGION=$3

echo "Mounting EFS File Systems"
sudo yum install -y amazon-efs-utils.noarch 0:1.10-1.amzn2
sudo yum install -y amazon-efs-utils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do different regions have different versions released?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The AMI IDs in the new list are Amazon Linux AMI 2018 images, while the old hard-coded one is an Amazon Linux 2 AMI.
The new list needs the amazon-efs-utils.noarch 0:1.10-1.amzn1 version. I just don't want to specify a specific version in case some of them need the amazon-efs-utils.noarch 0:1.10-1.amzn2 version.
It will automatically select the correct version if I don't specify one.

sudo mkdir efs
sudo mount -t efs "$FILE_SYSTEM_EFS_ID":/ efs
sudo mkdir efs/tensorflow
sudo mkdir efs/one_p_mnist

echo "Mounting FSx for Lustre File System"
sudo amazon-linux-extras install -y lustre2.10
sudo yum install -y lustre-client
sudo mkdir -p /mnt/fsx
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx.us-west-2.amazonaws.com@tcp:/fsx /mnt/fsx
sudo mount -t lustre -o noatime,flock "$FILE_SYSTEM_FSX_ID".fsx."$REGION".amazonaws.com@tcp:/fsx /mnt/fsx
sudo mkdir /mnt/fsx/tensorflow
sudo mkdir /mnt/fsx/one_p_mnist

Expand Down