Skip to content

Commit a468d36

Browse files
authored
Merge pull request #2 from aws/master
pull updates
2 parents 356c5d1 + d77ae75 commit a468d36

File tree

10 files changed

+276
-42
lines changed

10 files changed

+276
-42
lines changed

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Changelog
22

3+
## v1.31.0 (2019-06-27)
4+
5+
### Features
6+
7+
* use deep learning images
8+
9+
### Bug fixes and other changes
10+
11+
* Update buildspec.yml
12+
* allow only one integration test run at a time
13+
* remove unnecessary P3 tests from TFS integration tests
14+
* add pytest.mark.local_mode annotation to broken tests
15+
316
## v1.30.0 (2019-06-25)
417

518
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.30.1.dev0
1+
1.31.1.dev0

buildspec.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ phases:
99
commands:
1010
# run linters
1111
- tox -e flake8,pylint
12-
1312
# run package and docbuild checks
1413
- tox -e twine
1514
- tox -e sphinx
@@ -34,7 +33,14 @@ phases:
3433
# run integration tests
3534
- |
3635
if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
37-
IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2
36+
python3 -u ci-scripts/queue_build.py
37+
IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --reruns 3
3838
else
3939
echo "skipping integration tests"
4040
fi
41+
post_build:
42+
finally:
43+
- FILENAME=$(ls ci-lock/)
44+
- ACCOUNT=$(aws sts get-caller-identity --output text | awk '{print $1}')
45+
- S3_BUCKET_DIR=s3://sagemaker-us-west-2-${ACCOUNT}/ci-lock/
46+
- aws s3 rm ${S3_BUCKET_DIR}${FILENAME}

ci-scripts/queue_build.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import os
16+
import time
17+
import boto3
18+
19+
# Look up the AWS account id of the active credentials. This performs an STS
# call as soon as the module is loaded.
_caller_identity = boto3.client("sts").get_caller_identity()
account = _caller_identity["Account"]

# Name of the bucket that holds the ``ci-lock/`` queue tickets.
bucket_name = "sagemaker-us-west-2-%s" % account
21+
22+
23+
def queue_build():
    """Take a ticket in the build queue and wait until it is this build's turn.

    The ticket name is ``<ticket number>_<build id>_<source version>``. The
    ticket number is the current time in milliseconds, and ``/`` in the source
    version is replaced with ``-``. The build id and source version come from
    the ``CODEBUILD_BUILD_ID`` and ``CODEBUILD_SOURCE_VERSION`` environment
    variables, with placeholder defaults when they are unset.
    """
    env = os.environ
    build_id = env.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID")
    raw_source_version = env.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION")
    source_version = raw_source_version.replace("/", "-")

    # Millisecond timestamp doubles as the position in the queue.
    ticket_number = int(1000 * time.time())
    ticket_fields = (ticket_number, build_id, source_version)
    filename = "%s_%s_%s" % ticket_fields

    print("Created queue ticket %s" % ticket_number)

    _write_ticket(filename)

    # The same listing is used for both the cleanup and the wait.
    tickets = _list_tickets()
    _cleanup_tickets_older_than_8_hours(tickets)
    _wait_for_other_builds(tickets, ticket_number)
37+
38+
39+
def _build_info_from_file(file):
40+
filename = file.key.split("/")[1]
41+
ticket_number, build_id, source_version = filename.split("_")
42+
return int(ticket_number), build_id, source_version
43+
44+
45+
def _wait_for_other_builds(files, ticket_number):
    """Block until every build queued ahead of this one is no longer running.

    Tickets older than eight hours are ignored. The remaining tickets are
    ordered by key and printed as the queue status. Each ticket ahead of
    ``ticket_number`` is then polled through CodeBuild every 30 seconds until
    its build is no longer ``IN_PROGRESS``, at which point its lock object is
    deleted.

    Args:
        files: ticket objects exposing ``key`` and ``delete()``.
        ticket_number (int): ticket number of the current build.
    """
    newfiles = [file for file in files if not _file_older_than(file)]
    sorted_files = sorted(newfiles, key=lambda y: y.key)

    print("build queue status:")
    print()

    for order, file in enumerate(sorted_files):
        file_ticket_number, build_id, source_version = _build_info_from_file(file)
        print(
            "%s -> %s %s, ticket number: %s" % (order, build_id, source_version, file_ticket_number)
        )

    for file in sorted_files:
        file_ticket_number, build_id, source_version = _build_info_from_file(file)

        if file_ticket_number == ticket_number:
            # Reached our own ticket: everything ahead of us has finished.
            break

        # The client does not change between polls, so build it once per
        # ticket instead of on every iteration of the wait loop.
        client = boto3.client("codebuild")
        while True:
            response = client.batch_get_builds(ids=[build_id])
            # A build CodeBuild no longer knows about yields an empty "builds"
            # list; treat it as finished instead of failing on builds[0].
            builds = response.get("builds") or []
            build_status = builds[0]["buildStatus"] if builds else None

            if build_status == "IN_PROGRESS":
                print("waiting on build %s %s %s" % (build_id, source_version, file_ticket_number))
                time.sleep(30)
            else:
                print("build %s finished, deleting lock" % build_id)
                file.delete()
                break
79+
80+
81+
def _cleanup_tickets_older_than_8_hours(files):
    """Delete every ticket that ``_file_older_than`` reports as expired.

    Args:
        files: ticket objects exposing ``key`` and ``delete()``.

    Returns:
        The ``files`` argument, unchanged (deleted tickets are not removed
        from it).
    """
    # Decide which tickets are stale before deleting any of them.
    stale_tickets = [ticket for ticket in files if _file_older_than(ticket)]
    for ticket in stale_tickets:
        print("object %s older than 8 hours. Deleting" % ticket.key)
        ticket.delete()
    return files
87+
88+
89+
def _list_tickets():
    """List the ticket objects stored under the ``ci-lock/`` prefix of the bucket.

    Returns:
        list: the S3 objects found under ``ci-lock/``, excluding the object
        whose key is the bare ``ci-lock/`` prefix itself.
    """
    bucket = boto3.resource("s3").Bucket(bucket_name)
    # Compare the object's key, not the object itself: the listed objects are
    # never equal to a string, so the previous filter never removed the
    # "ci-lock/" placeholder and it was later parsed as if it were a ticket.
    return [obj for obj in bucket.objects.filter(Prefix="ci-lock/") if obj.key != "ci-lock/"]
95+
96+
97+
def _file_older_than(file):
98+
timelimit = 1000 * 60 * 60 * 8
99+
100+
file_ticket_number, build_id, source_version = _build_info_from_file(file)
101+
102+
return int(time.time()) - file_ticket_number > timelimit
103+
104+
105+
def _write_ticket(ticket_number):
    """Create the local ticket file and upload it to the lock bucket.

    The file is written to ``ci-lock/<ticket_number>`` (creating the
    ``ci-lock`` directory if needed) and uploaded under the same key.

    Args:
        ticket_number (str): ticket file name; it is also written as the
            file's content.
    """
    lock_dir = "ci-lock"
    if not os.path.exists(lock_dir):
        os.mkdir(lock_dir)

    filename = "ci-lock/" + ticket_number
    with open(filename, "w") as ticket_file:
        ticket_file.write(ticket_number)

    # Upload once the file has been closed so its content is on disk.
    s3 = boto3.Session().resource("s3")
    s3.Object(bucket_name, filename).upload_file(filename)
114+
115+
116+
# Queue the build only when this file is executed as a script.
if __name__ == "__main__":
    queue_build()

src/sagemaker/fw_utils.py

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,56 @@
5454
VALID_EIA_FRAMEWORKS = ["tensorflow", "tensorflow-serving", "mxnet", "mxnet-serving"]
5555
VALID_ACCOUNTS_BY_REGION = {"us-gov-west-1": "246785580436", "us-iso-east-1": "744548109606"}
5656

57+
# Repository name used for each framework when its merged image is selected.
MERGED_FRAMEWORKS_REPO_MAP = {
    "tensorflow-scriptmode": "tensorflow-training",
    "mxnet": "mxnet-training",
    "tensorflow-serving": "tensorflow-inference",
    "mxnet-serving": "mxnet-inference",
}

# Lowest framework version, as [major, minor, patch], from which each
# framework uses the merged images.
MERGED_FRAMEWORKS_LOWEST_VERSIONS = {
    "tensorflow-scriptmode": [1, 13, 1],
    "mxnet": [1, 4, 1],
    "tensorflow-serving": [1, 13, 0],
    "mxnet-serving": [1, 4, 1],
}
70+
71+
72+
def is_version_equal_or_higher(lowest_version, framework_version):
    """Check that ``framework_version`` is at least ``lowest_version``.

    Only as many components as ``framework_version`` provides are compared,
    so ``"1.13"`` satisfies a lowest version of ``[1, 13, 1]``.

    Args:
        lowest_version (List[int]): lowest accepted version as a list of integers
        framework_version (str): dot-separated framework version string

    Returns:
        bool: True if framework_version is equal to or higher than lowest_version
    """
    requested = list(map(int, framework_version.split(".")))
    # Trim the baseline to the precision of the requested version.
    baseline = lowest_version[: len(requested)]
    return requested >= baseline
84+
85+
86+
def _is_merged_versions(framework, framework_version):
    """Tell whether ``framework_version`` of ``framework`` uses the merged images.

    Frameworks without an entry in ``MERGED_FRAMEWORKS_LOWEST_VERSIONS`` never do.
    """
    minimum_version = MERGED_FRAMEWORKS_LOWEST_VERSIONS.get(framework)
    if not minimum_version:
        return False
    return is_version_equal_or_higher(minimum_version, framework_version)
92+
93+
94+
def _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
    """Decide whether the merged framework image applies to this configuration.

    All of the following must hold: the region has no entry in
    ``VALID_ACCOUNTS_BY_REGION``, the framework version qualifies, Python is
    ``py3`` (or unspecified), and no accelerator type is requested.
    """
    # Evaluated first so that an unparsable version still raises for every region.
    version_qualifies = _is_merged_versions(framework, framework_version)

    if region in VALID_ACCOUNTS_BY_REGION:
        return False
    if accelerator_type is not None:
        return False

    python3 = py_version is None or py_version == "py3"
    return version_qualifies and python3
99+
100+
101+
def _registry_id(region, framework, py_version, account, accelerator_type, framework_version):
    """Pick the registry account id for the image.

    Returns the fixed account ``763104351884`` when the merged image applies.
    Otherwise returns the region's entry in ``VALID_ACCOUNTS_BY_REGION``,
    falling back to ``account``.
    """
    merged = _using_merged_images(
        region, framework, py_version, accelerator_type, framework_version
    )
    return "763104351884" if merged else VALID_ACCOUNTS_BY_REGION.get(region, account)
106+
57107

58108
def create_image_uri(
59109
region,
@@ -86,8 +136,15 @@ def create_image_uri(
86136
if py_version and py_version not in VALID_PY_VERSIONS:
87137
raise ValueError("invalid py_version argument: {}".format(py_version))
88138

89-
# Handle Account Number for Gov Cloud
90-
account = VALID_ACCOUNTS_BY_REGION.get(region, account)
139+
# Handle Account Number for Gov Cloud and frameworks with DLC merged images
140+
account = _registry_id(
141+
region=region,
142+
framework=framework,
143+
py_version=py_version,
144+
account=account,
145+
accelerator_type=accelerator_type,
146+
framework_version=framework_version,
147+
)
91148

92149
# Handle Local Mode
93150
if instance_type.startswith("local"):
@@ -121,7 +178,14 @@ def create_image_uri(
121178
):
122179
framework += "-eia"
123180

124-
return "{}/sagemaker-{}:{}".format(get_ecr_image_uri_prefix(account, region), framework, tag)
181+
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
182+
return "{}/{}:{}".format(
183+
get_ecr_image_uri_prefix(account, region), MERGED_FRAMEWORKS_REPO_MAP[framework], tag
184+
)
185+
else:
186+
return "{}/sagemaker-{}:{}".format(
187+
get_ecr_image_uri_prefix(account, region), framework, tag
188+
)
125189

126190

127191
def _accelerator_type_valid_for_framework(
@@ -264,7 +328,7 @@ def framework_name_from_image(image_name):
264328
# extract framework, python version and image tag
265329
# We must support both the legacy and current image name format.
266330
name_pattern = re.compile(
267-
r"^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
331+
r"^(?:sagemaker(?:-rl)?-)?(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode|training)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
268332
)
269333
legacy_name_pattern = re.compile(r"^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$")
270334

tests/integ/test_git.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616

1717
import numpy
18+
import pytest
1819
import tempfile
1920

2021
from tests.integ import lock as lock
@@ -30,6 +31,7 @@
3031
LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_git_lock")
3132

3233

34+
@pytest.mark.local_mode
3335
def test_git_support_with_pytorch(sagemaker_local_session):
3436
script_path = "mnist.py"
3537
data_path = os.path.join(DATA_DIR, "pytorch_mnist")
@@ -59,6 +61,7 @@ def test_git_support_with_pytorch(sagemaker_local_session):
5961
predictor.delete_endpoint()
6062

6163

64+
@pytest.mark.local_mode
6265
def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version):
6366
script_path = "mnist.py"
6467
data_path = os.path.join(DATA_DIR, "mxnet_mnist")

tests/integ/test_tf_script_mode.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def test_mnist(sagemaker_session, instance_type):
6565
sagemaker_session=sagemaker_session,
6666
script_mode=True,
6767
framework_version=TensorFlow.LATEST_VERSION,
68+
py_version=tests.integ.PYTHON_VERSION,
6869
metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
6970
)
7071
inputs = estimator.sagemaker_session.upload_data(
@@ -98,6 +99,7 @@ def test_server_side_encryption(sagemaker_session):
9899
sagemaker_session=sagemaker_session,
99100
script_mode=True,
100101
framework_version=TensorFlow.LATEST_VERSION,
102+
py_version=tests.integ.PYTHON_VERSION,
101103
code_location=output_path,
102104
output_path=output_path,
103105
model_dir="/opt/ml/model",
@@ -144,6 +146,7 @@ def test_mnist_async(sagemaker_session):
144146
role=ROLE,
145147
train_instance_count=1,
146148
train_instance_type="ml.c5.4xlarge",
149+
py_version=tests.integ.PYTHON_VERSION,
147150
sagemaker_session=sagemaker_session,
148151
script_mode=True,
149152
framework_version=TensorFlow.LATEST_VERSION,
@@ -182,6 +185,7 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type):
182185
role=ROLE,
183186
train_instance_count=1,
184187
train_instance_type=instance_type,
188+
py_version=tests.integ.PYTHON_VERSION,
185189
sagemaker_session=sagemaker_session,
186190
script_mode=True,
187191
framework_version=TensorFlow.LATEST_VERSION,

tests/integ/test_tfs.py

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,25 +26,8 @@
2626
from sagemaker.tensorflow.serving import Model, Predictor
2727

2828

29-
@pytest.fixture(
30-
scope="session",
31-
params=[
32-
"ml.c5.xlarge",
33-
pytest.param(
34-
"ml.p3.2xlarge",
35-
marks=pytest.mark.skipif(
36-
tests.integ.test_region() in tests.integ.HOSTING_NO_P3_REGIONS,
37-
reason="no ml.p3 instances in this region",
38-
),
39-
),
40-
],
41-
)
42-
def instance_type(request):
43-
return request.param
44-
45-
4629
@pytest.fixture(scope="module")
47-
def tfs_predictor(instance_type, sagemaker_session, tf_full_version):
30+
def tfs_predictor(sagemaker_session, tf_full_version):
4831
endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
4932
model_data = sagemaker_session.upload_data(
5033
path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
@@ -57,7 +40,7 @@ def tfs_predictor(instance_type, sagemaker_session, tf_full_version):
5740
framework_version=tf_full_version,
5841
sagemaker_session=sagemaker_session,
5942
)
60-
predictor = model.deploy(1, instance_type, endpoint_name=endpoint_name)
43+
predictor = model.deploy(1, "ml.c5.xlarge", endpoint_name=endpoint_name)
6144
yield predictor
6245

6346

@@ -130,8 +113,6 @@ def tfs_predictor_with_model_and_entry_point_and_dependencies(
130113
@pytest.fixture(scope="module")
131114
def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version):
132115
endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
133-
instance_type = "ml.c4.large"
134-
accelerator_type = "ml.eia1.medium"
135116
model_data = sagemaker_session.upload_data(
136117
path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
137118
key_prefix="tensorflow-serving/models",
@@ -144,13 +125,13 @@ def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version):
144125
sagemaker_session=sagemaker_session,
145126
)
146127
predictor = model.deploy(
147-
1, instance_type, endpoint_name=endpoint_name, accelerator_type=accelerator_type
128+
1, "ml.c4.large", endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium"
148129
)
149130
yield predictor
150131

151132

152133
@pytest.mark.canary_quick
153-
def test_predict(tfs_predictor, instance_type): # pylint: disable=W0613
134+
def test_predict(tfs_predictor): # pylint: disable=W0613
154135
input_data = {"instances": [1.0, 2.0, 5.0]}
155136
expected_result = {"predictions": [3.5, 4.0, 5.5]}
156137

0 commit comments

Comments
 (0)