Skip to content

Commit 79ef51e

Browse files
authored
Merge branch 'master' into refactor-tensorflow-general
2 parents 94d271e + fe29f60 commit 79ef51e

File tree

12 files changed

+313
-80
lines changed

12 files changed

+313
-80
lines changed

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# Changelog
22

3+
## v1.31.0 (2019-06-27)
4+
5+
### Features
6+
7+
* use deep learning images
8+
9+
### Bug fixes and other changes
10+
11+
* Update buildspec.yml
12+
* allow only one integration test run at a time
13+
* remove unnecessary P3 tests from TFS integration tests
14+
* add pytest.mark.local_mode annotation to broken tests
15+
316
## v1.30.0 (2019-06-25)
417

518
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.30.1.dev0
1+
1.31.1.dev0

buildspec.yml

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,35 @@ phases:
88
build:
99
commands:
1010
# run linters
11-
- tox -e flake8,pylint
12-
13-
# run package and docbuild checks
14-
- tox -e twine
11+
- TOX_PARALLEL_NO_SPINNER=1
12+
- PY_COLORS=0
13+
- tox -e flake8,pylint,twine,black-check --parallel all
1514
- tox -e sphinx
1615

17-
# run format verification
18-
- tox -e black-check
19-
2016
# run unit tests
2117
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
2218
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
23-
tox -e py36,py27 -- tests/unit
19+
tox -e py36,py27 --parallel all -- tests/unit
2420

21+
# local mode tests
22+
- |
23+
if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
24+
IGNORE_COVERAGE=- tox -e py36 -- tests/integ -m local_mode --durations 50
25+
IGNORE_COVERAGE=- tox -e py27 -- tests/integ -m local_mode --durations 50
26+
else
27+
echo "skipping integration tests"
28+
fi
29+
30+
# run integration tests
31+
- |
32+
if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
33+
python3 -u ci-scripts/queue_build.py
34+
IGNORE_COVERAGE=- tox -e py36 -- tests/integ -m "not local_mode" -n 48 --reruns 3 --reruns-delay 5 --durations 50
35+
IGNORE_COVERAGE=- tox -e py27 -- tests/integ -m "not local_mode" -n 48 --reruns 3 --reruns-delay 5 --durations 50
36+
else
37+
echo "skipping integration tests"
38+
fi
39+
2540
# run notebook test
2641
- |
2742
if has-matching-changes "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
@@ -30,11 +45,9 @@ phases:
3045
else
3146
echo "skipping notebook test"
3247
fi
33-
34-
# run integration tests
35-
- |
36-
if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
37-
IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2
38-
else
39-
echo "skipping integration tests"
40-
fi
48+
post_build:
49+
finally:
50+
- FILENAME=$(ls ci-lock/)
51+
- ACCOUNT=$(aws sts get-caller-identity --output text | awk '{print $1}')
52+
- S3_BUCKET_DIR=s3://sagemaker-us-west-2-${ACCOUNT}/ci-lock/
53+
- aws s3 rm ${S3_BUCKET_DIR}${FILENAME}

ci-scripts/queue_build.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import os
16+
import time
17+
import boto3
18+
19+
account = boto3.client("sts").get_caller_identity()["Account"]
20+
bucket_name = "sagemaker-us-west-2-%s" % account
21+
22+
23+
def queue_build():
    """Take a ticket in the CI queue and wait until it is this build's turn.

    The build id and source version are read from the ``CODEBUILD_BUILD_ID``
    and ``CODEBUILD_SOURCE_VERSION`` environment variables (placeholder values
    are used when they are unset). A ticket named
    ``<ticket_number>_<build_id>_<source_version>`` is written, stale tickets
    are removed, and the call blocks until the builds ahead in the queue are
    no longer in progress.
    """
    env = os.environ
    build_id = env.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID")
    raw_source_version = env.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION")
    # "/" is the key separator inside the lock prefix, so it must not appear in the name.
    source_version = raw_source_version.replace("/", "-")

    # The millisecond timestamp doubles as this build's position in the queue.
    ticket_number = int(1000 * time.time())
    print("Created queue ticket %s" % ticket_number)

    _write_ticket("%s_%s_%s" % (ticket_number, build_id, source_version))

    tickets = _list_tickets()
    _cleanup_tickets_older_than_8_hours(tickets)
    _wait_for_other_builds(tickets, ticket_number)
37+
38+
39+
def _build_info_from_file(file):
40+
filename = file.key.split("/")[1]
41+
ticket_number, build_id, source_version = filename.split("_")
42+
return int(ticket_number), build_id, source_version
43+
44+
45+
def _wait_for_other_builds(files, ticket_number):
    """Block until every build queued ahead of ``ticket_number`` has finished.

    Tickets that are not stale are sorted by key and printed as the current
    queue. Each ticket ahead of ours is then polled through CodeBuild every
    30 seconds; once its build is no longer ``IN_PROGRESS`` its lock file is
    deleted and the next ticket is checked. Polling stops at our own ticket.

    Args:
        files: ticket objects exposing ``key`` and ``delete()``.
        ticket_number (int): the ticket number of the current build.
    """
    fresh_tickets = [file for file in files if not _file_older_than(file)]
    sorted_files = sorted(fresh_tickets, key=lambda ticket: ticket.key)

    print("build queue status:")
    print()

    for order, file in enumerate(sorted_files):
        file_ticket_number, build_id, source_version = _build_info_from_file(file)
        print(
            "%s -> %s %s, ticket number: %s" % (order, build_id, source_version, file_ticket_number)
        )

    # One client serves every status poll; there is no need to rebuild it per iteration.
    client = boto3.client("codebuild")

    for file in sorted_files:
        file_ticket_number, build_id, source_version = _build_info_from_file(file)

        if file_ticket_number == ticket_number:
            # Reached our own ticket: nothing ahead of us is still running.
            break

        while True:
            response = client.batch_get_builds(ids=[build_id])
            builds = response.get("builds") or []

            if not builds:
                # CodeBuild did not return the build (presumably reported under
                # "buildsNotFound" -- confirm against the API docs). Indexing
                # builds[0] would raise IndexError, so drop the orphaned lock.
                print("build %s not found, deleting lock" % build_id)
                file.delete()
                break

            if builds[0]["buildStatus"] == "IN_PROGRESS":
                print("waiting on build %s %s %s" % (build_id, source_version, file_ticket_number))
                time.sleep(30)
            else:
                print("build %s finished, deleting lock" % build_id)
                file.delete()
                break
79+
80+
81+
def _cleanup_tickets_older_than_8_hours(files):
    """Delete every ticket that ``_file_older_than`` reports as stale.

    Args:
        files: ticket objects exposing ``key`` and ``delete()``.

    Returns:
        The ``files`` argument, unchanged (deleted tickets are not removed
        from the list).
    """
    # Collect the stale tickets first, then delete them.
    stale_tickets = [ticket for ticket in files if _file_older_than(ticket)]
    for stale in stale_tickets:
        print("object %s older than 8 hours. Deleting" % stale.key)
        stale.delete()
    return files
87+
88+
89+
def _list_tickets():
    """List the queue tickets stored under the ``ci-lock/`` prefix of the CI bucket.

    Returns:
        list: the objects under ``ci-lock/``, excluding the bare ``ci-lock/``
            prefix placeholder if one exists.
    """
    bucket = boto3.resource("s3").Bucket(bucket_name)
    # Compare on the object's key. The listed items are objects, not strings,
    # so comparing the object itself to "ci-lock/" never filtered anything and
    # let the placeholder key reach the ticket-name parser.
    return [obj for obj in bucket.objects.filter(Prefix="ci-lock/") if obj.key != "ci-lock/"]
95+
96+
97+
def _file_older_than(file):
    """Return whether a queue ticket was created more than 8 hours ago.

    Args:
        file: ticket object whose key encodes the ticket number.

    Returns:
        bool: True if the ticket is older than the 8 hour limit.
    """
    # 8 hours, in milliseconds.
    timelimit = 1000 * 60 * 60 * 8

    file_ticket_number, _build_id, _source_version = _build_info_from_file(file)

    # Ticket numbers are millisecond timestamps (queue_build creates them with
    # int(1000 * time.time())), so "now" must be in milliseconds as well.
    # Comparing against seconds made the difference negative and no ticket
    # was ever considered stale.
    now_ms = int(1000 * time.time())
    return now_ms - file_ticket_number > timelimit
103+
104+
105+
def _write_ticket(ticket_number):
    """Create the local ticket file and upload it to the CI bucket.

    Args:
        ticket_number (str): the ticket's file name; it is also written as the
            file's content.
    """
    lock_dir = "ci-lock"
    if not os.path.exists(lock_dir):
        os.mkdir(lock_dir)

    filename = "ci-lock/" + ticket_number
    with open(filename, "w") as ticket_file:
        ticket_file.write(ticket_number)

    # The same relative path is used as the local file name and as the S3 key.
    s3_object = boto3.Session().resource("s3").Object(bucket_name, filename)
    s3_object.upload_file(filename)
114+
115+
116+
if __name__ == "__main__":
117+
queue_build()

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def read_version():
6767
install_requires=required_packages,
6868
extras_require={
6969
"test": [
70-
"tox",
70+
"tox==3.13.1",
7171
"flake8",
7272
"pytest==4.4.1",
7373
"pytest-cov",
@@ -79,6 +79,7 @@ def read_version():
7979
"awslogs",
8080
"pandas",
8181
"black==19.3b0 ; python_version >= '3.6'",
82+
"stopit==1.1.2",
8283
]
8384
},
8485
entry_points={"console_scripts": ["sagemaker=sagemaker.cli.main:main"]},

src/sagemaker/fw_utils.py

Lines changed: 68 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,56 @@
5454
VALID_EIA_FRAMEWORKS = ["tensorflow", "tensorflow-serving", "mxnet", "mxnet-serving"]
5555
VALID_ACCOUNTS_BY_REGION = {"us-gov-west-1": "246785580436", "us-iso-east-1": "744548109606"}
5656

57+
MERGED_FRAMEWORKS_REPO_MAP = {
58+
"tensorflow-scriptmode": "tensorflow-training",
59+
"mxnet": "mxnet-training",
60+
"tensorflow-serving": "tensorflow-inference",
61+
"mxnet-serving": "mxnet-inference",
62+
}
63+
64+
MERGED_FRAMEWORKS_LOWEST_VERSIONS = {
65+
"tensorflow-scriptmode": [1, 13, 1],
66+
"mxnet": [1, 4, 1],
67+
"tensorflow-serving": [1, 13, 0],
68+
"mxnet-serving": [1, 4, 1],
69+
}
70+
71+
72+
def is_version_equal_or_higher(lowest_version, framework_version):
    """Determine whether the ``framework_version`` is equal to or higher than ``lowest_version``

    Args:
        lowest_version (List[int]): lowest version represented in an integer list
        framework_version (str): framework version string

    Returns:
        bool: Whether or not framework_version is equal to or higher than lowest_version
    """
    parsed_version = [int(part) for part in framework_version.split(".")]
    # Compare only as many components as the version string supplies, so that
    # "1.13" is measured against [1, 13] rather than [1, 13, 1].
    floor = lowest_version[: len(parsed_version)]
    return parsed_version >= floor
84+
85+
86+
def _is_merged_versions(framework, framework_version):
    """Return whether ``framework_version`` meets the framework's lowest merged version.

    Args:
        framework (str): framework name, looked up in
            ``MERGED_FRAMEWORKS_LOWEST_VERSIONS``.
        framework_version (str): framework version string.

    Returns:
        bool: False when the framework has no entry in the table; otherwise
            whether the version is equal to or higher than the listed one.
    """
    lowest_version_list = MERGED_FRAMEWORKS_LOWEST_VERSIONS.get(framework)
    if not lowest_version_list:
        return False
    return is_version_equal_or_higher(lowest_version_list, framework_version)
92+
93+
94+
def _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
    """Return whether the merged framework images apply to this configuration.

    All of the following must hold: the region has no entry in
    ``VALID_ACCOUNTS_BY_REGION``, the framework/version pair passes
    ``_is_merged_versions``, the Python version is ``"py3"`` or unspecified,
    and no accelerator type is requested.
    """
    region_has_own_account = region in VALID_ACCOUNTS_BY_REGION
    python3_or_unset = py_version is None or py_version == "py3"
    # Evaluated unconditionally, so an unparsable version string raises here
    # regardless of the other conditions.
    merged_version = _is_merged_versions(framework, framework_version)
    no_accelerator = accelerator_type is None
    return not region_has_own_account and merged_version and python3_or_unset and no_accelerator
99+
100+
101+
def _registry_id(region, framework, py_version, account, accelerator_type, framework_version):
    """Pick the registry account id to use for the image.

    Returns:
        str: ``"763104351884"`` when the merged images apply; otherwise the
            region's entry in ``VALID_ACCOUNTS_BY_REGION``, falling back to
            ``account``.
    """
    merged = _using_merged_images(
        region, framework, py_version, accelerator_type, framework_version
    )
    return "763104351884" if merged else VALID_ACCOUNTS_BY_REGION.get(region, account)
106+
57107

58108
def create_image_uri(
59109
region,
@@ -86,8 +136,15 @@ def create_image_uri(
86136
if py_version and py_version not in VALID_PY_VERSIONS:
87137
raise ValueError("invalid py_version argument: {}".format(py_version))
88138

89-
# Handle Account Number for Gov Cloud
90-
account = VALID_ACCOUNTS_BY_REGION.get(region, account)
139+
# Handle Account Number for Gov Cloud and frameworks with DLC merged images
140+
account = _registry_id(
141+
region=region,
142+
framework=framework,
143+
py_version=py_version,
144+
account=account,
145+
accelerator_type=accelerator_type,
146+
framework_version=framework_version,
147+
)
91148

92149
# Handle Local Mode
93150
if instance_type.startswith("local"):
@@ -121,7 +178,14 @@ def create_image_uri(
121178
):
122179
framework += "-eia"
123180

124-
return "{}/sagemaker-{}:{}".format(get_ecr_image_uri_prefix(account, region), framework, tag)
181+
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
182+
return "{}/{}:{}".format(
183+
get_ecr_image_uri_prefix(account, region), MERGED_FRAMEWORKS_REPO_MAP[framework], tag
184+
)
185+
else:
186+
return "{}/sagemaker-{}:{}".format(
187+
get_ecr_image_uri_prefix(account, region), framework, tag
188+
)
125189

126190

127191
def _accelerator_type_valid_for_framework(
@@ -264,7 +328,7 @@ def framework_name_from_image(image_name):
264328
# extract framework, python version and image tag
265329
# We must support both the legacy and current image name format.
266330
name_pattern = re.compile(
267-
r"^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
331+
r"^(?:sagemaker(?:-rl)?-)?(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode|training)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
268332
)
269333
legacy_name_pattern = re.compile(r"^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$")
270334

tests/integ/test_local_mode.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@
2020
import pytest
2121
import tempfile
2222

23+
import stopit
24+
2325
import tests.integ.lock as lock
2426
from tests.integ import DATA_DIR, PYTHON_VERSION
25-
from tests.integ.timeout import timeout
2627

2728
from sagemaker.local import LocalSession, LocalSagemakerRuntimeClient, LocalSagemakerClient
2829
from sagemaker.mxnet import MXNet
@@ -86,7 +87,7 @@ def _create_model(output_path):
8687
@pytest.mark.local_mode
8788
@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
8889
def test_tf_local_mode(sagemaker_local_session):
89-
with timeout(minutes=5):
90+
with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
9091
script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
9192

9293
estimator = TensorFlow(
@@ -129,7 +130,7 @@ def test_tf_local_mode(sagemaker_local_session):
129130
@pytest.mark.local_mode
130131
@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
131132
def test_tf_distributed_local_mode(sagemaker_local_session):
132-
with timeout(minutes=5):
133+
with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
133134
script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
134135

135136
estimator = TensorFlow(
@@ -171,7 +172,7 @@ def test_tf_distributed_local_mode(sagemaker_local_session):
171172
@pytest.mark.local_mode
172173
@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
173174
def test_tf_local_data(sagemaker_local_session):
174-
with timeout(minutes=5):
175+
with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
175176
script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
176177

177178
estimator = TensorFlow(
@@ -212,7 +213,7 @@ def test_tf_local_data(sagemaker_local_session):
212213
@pytest.mark.local_mode
213214
@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
214215
def test_tf_local_data_local_script():
215-
with timeout(minutes=5):
216+
with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
216217
script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
217218

218219
estimator = TensorFlow(
@@ -391,7 +392,7 @@ def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_versi
391392
path=os.path.join(data_path, "test"), key_prefix="integ-test-data/mxnet_mnist/test"
392393
)
393394

394-
with timeout(minutes=15):
395+
with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
395396
mx.fit({"train": train_input, "test": test_input})
396397

397398
transform_input_path = os.path.join(data_path, "transform")

0 commit comments

Comments
 (0)