Skip to content

Commit 7fa3679

Browse files
authored
Add SageMaker hosting integ test (#18)
1 parent ee43b1f commit 7fa3679

File tree

6 files changed

+197
-22
lines changed

6 files changed

+197
-22
lines changed

setup.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,10 @@ def read(fname):
4646
'Programming Language :: Python :: 3.6',
4747
],
4848

49-
# We don't declare our dependency on mxnet here because we build with
50-
# different packages for different variants (e.g. mxnet-mkl and mxnet-cu90).
5149
install_requires=['sagemaker-inference==1.0.0'],
5250
extras_require={
53-
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', 'sagemaker',
54-
'docker-compose', 'mxnet==1.4.0']
51+
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
52+
'sagemaker==1.23.0', 'docker-compose', 'mxnet==1.4.0', 'awslogs']
5553
},
5654

5755
entry_points={

test/conftest.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License").
4-
# You may not use this file except in compliance with the License.
5-
# A copy of the License is located at
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the License.
5+
# A copy of the License is located at
66
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
7+
# http://www.apache.org/licenses/LICENSE-2.0
88
#
9-
# or in the "license" file accompanying this file. This file is distributed
10-
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11-
# express or implied. See the License for the specific language governing
12-
# permissions and limitations under the License.
9+
# or in the "license" file accompanying this file. This file is distributed
10+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
# express or implied. See the License for the specific language governing
12+
# permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

1515
import logging

test/integration/local/test_hosting.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
22
#
3-
# Licensed under the Apache License, Version 2.0 (the "License").
4-
# You may not use this file except in compliance with the License.
5-
# A copy of the License is located at
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the License.
5+
# A copy of the License is located at
66
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
7+
# http://www.apache.org/licenses/LICENSE-2.0
88
#
9-
# or in the "license" file accompanying this file. This file is distributed
10-
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11-
# express or implied. See the License for the specific language governing
12-
# permissions and limitations under the License.
9+
# or in the "license" file accompanying this file. This file is distributed
10+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
# express or implied. See the License for the specific language governing
12+
# permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

1515
import json
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the License.
5+
# A copy of the License is located at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# or in the "license" file accompanying this file. This file is distributed
10+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
11+
# express or implied. See the License for the specific language governing
12+
# permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import os
16+
17+
from sagemaker import utils
18+
from sagemaker.mxnet.model import MXNetModel
19+
20+
from test.integration import RESOURCE_PATH
21+
import timeout
22+
23+
# Locations of the default-handlers fixtures used by this test module,
# all rooted at RESOURCE_PATH:
#   DEFAULT_HANDLER_PATH - directory holding the fixtures
#   MODEL_PATH           - model archive uploaded by the test
#   SCRIPT_PATH          - entry-point script handed to MXNetModel
DEFAULT_HANDLER_PATH = os.path.join(RESOURCE_PATH, 'default_handlers')
MODEL_PATH = os.path.join(RESOURCE_PATH, 'default_handlers', 'model.tar.gz')
SCRIPT_PATH = os.path.join(RESOURCE_PATH, 'default_handlers', 'model', 'code', 'empty_module.py')
26+
27+
28+
def test_hosting(sagemaker_session, ecr_image, instance_type):
    """Deploy the default-handlers model to an endpoint and check one prediction.

    The model archive at MODEL_PATH is uploaded through ``sagemaker_session``,
    wrapped in an ``MXNetModel`` that uses ``ecr_image``, and deployed to a
    single instance of ``instance_type``. The endpoint is deleted when the
    ``with`` block exits, so the prediction has to happen inside it.
    """
    s3_prefix = 'mxnet-serving/default-handlers'
    uploaded_model = sagemaker_session.upload_data(path=MODEL_PATH, key_prefix=s3_prefix)

    mxnet_model = MXNetModel(
        uploaded_model,
        'SageMakerRole',
        SCRIPT_PATH,
        image=ecr_image,
        sagemaker_session=sagemaker_session
    )

    endpoint_name = utils.unique_name_from_base('test-mxnet-serving')
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = mxnet_model.deploy(1, instance_type, endpoint_name=endpoint_name)

        expected = [[4.9999918937683105]]
        prediction = predictor.predict([[1, 2]])
        assert expected == prediction

test/integration/sagemaker/timeout.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
from contextlib import contextmanager
16+
import logging
17+
import signal
18+
from time import sleep
19+
20+
from awslogs.core import AWSLogs
21+
from botocore.exceptions import ClientError
22+
23+
# Module-level logger named 'timeout'; the cleanup helpers in this module use
# it to report deletions and any failures they swallow.
LOGGER = logging.getLogger('timeout')
24+
25+
26+
class TimeoutError(Exception):
    """Raised by the SIGALRM handler when a ``timeout`` block runs past its limit.

    NOTE: on Python 3 this shadows the builtin ``TimeoutError`` inside this
    module; this class derives directly from ``Exception``.
    """
28+
29+
30+
@contextmanager
def timeout(seconds=0, minutes=0, hours=0):
    """
    Add a signal-based timeout to any block of code.
    If multiple time units are specified, they will be added together to determine time limit.
    Usage:
    with timeout(seconds=5):
        my_slow_function(...)
    Args:
        - seconds: The time limit, in seconds.
        - minutes: The time limit, in minutes.
        - hours: The time limit, in hours.
    Raises:
        TimeoutError: inside the managed block, once the limit has elapsed.
    Notes:
        - The total is passed to ``signal.alarm``, so it must be a whole number of
          seconds; a total of 0 schedules no alarm, i.e. no timeout is enforced.
        - Relies on SIGALRM, so it only works where that signal is available and
          handlers may be installed.
        - Leaving the block cancels any pending alarm and reinstates the SIGALRM
          handler that was active on entry.
    """

    limit = seconds + 60 * minutes + 3600 * hours

    def handler(signum, frame):
        raise TimeoutError('timed out after {} seconds'.format(limit))

    # Remember whoever owned SIGALRM before us so the handler does not leak
    # past the end of the block.
    previous_handler = signal.signal(signal.SIGALRM, handler)
    try:
        signal.alarm(limit)

        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the previous handler was not
        # installed from Python; None cannot be passed back in, so skip it.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
56+
57+
58+
@contextmanager
def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=45, hours=0):
    """
    Run a block of code under a time limit and always delete the endpoint afterwards.

    Deletion is best effort: it is tried up to three times, 10 seconds apart, and a
    ``ClientError`` from any attempt is logged and swallowed so that it cannot mask an
    exception raised by the managed block. After a successful deletion the endpoint's
    CloudWatch logs are printed; the log group is deleted only if the managed block
    finished without raising.

    Args:
        - endpoint_name: Name of the endpoint to delete on exit.
        - sagemaker_session: Session used to delete the endpoint and reach CloudWatch.
        - seconds, minutes, hours: Time limit, summed by ``timeout`` (default 45 minutes).
    Yields:
        A one-element list holding whatever ``timeout`` yields.
    """
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        no_errors = False
        try:
            yield [t]
            no_errors = True
        finally:
            max_attempts = 3

            for attempt in range(1, max_attempts + 1):
                try:
                    sagemaker_session.delete_endpoint(endpoint_name)
                    LOGGER.info('deleted endpoint {}'.format(endpoint_name))

                    _show_logs(endpoint_name, 'Endpoints', sagemaker_session)
                    if no_errors:
                        _cleanup_logs(endpoint_name, 'Endpoints', sagemaker_session)
                    break
                except ClientError as ce:
                    # Swallowed on purpose: re-raising here would replace any
                    # exception coming out of the managed block.
                    LOGGER.warning('failed to delete endpoint %s (attempt %d of %d): %s',
                                   endpoint_name, attempt, max_attempts, ce)
                    # Wait before retrying, but not after the final attempt.
                    if attempt < max_attempts:
                        sleep(10)
84+
85+
86+
@contextmanager
def timeout_and_delete_model_with_transformer(transformer, sagemaker_session, seconds=0, minutes=0, hours=0):
    """
    Run a block of code under a time limit and always delete the transformer's model afterwards.

    Deletion is best effort: it is tried up to three times, 10 seconds apart, and a
    ``ClientError`` from any attempt is logged and swallowed so that it cannot mask an
    exception raised by the managed block. After a successful deletion the model's
    CloudWatch logs are printed; the log group is deleted only if the managed block
    finished without raising.

    Args:
        - transformer: Object providing ``delete_model()`` and ``model_name``.
        - sagemaker_session: Session used to reach CloudWatch.
        - seconds, minutes, hours: Time limit, summed by ``timeout``. The defaults add up
          to 0, which is handed to ``signal.alarm`` and schedules no alarm, so callers
          that want a limit must pass one.
    Yields:
        A one-element list holding whatever ``timeout`` yields.
    """
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        no_errors = False
        try:
            yield [t]
            no_errors = True
        finally:
            max_attempts = 3

            for attempt in range(1, max_attempts + 1):
                try:
                    transformer.delete_model()
                    LOGGER.info('deleted SageMaker model {}'.format(transformer.model_name))

                    _show_logs(transformer.model_name, 'Models', sagemaker_session)
                    if no_errors:
                        _cleanup_logs(transformer.model_name, 'Models', sagemaker_session)
                    break
                except ClientError as ce:
                    # Swallowed on purpose: re-raising here would replace any
                    # exception coming out of the managed block.
                    LOGGER.warning('failed to delete SageMaker model %s (attempt %d of %d): %s',
                                   transformer.model_name, attempt, max_attempts, ce)
                    # Wait before retrying, but not after the final attempt.
                    if attempt < max_attempts:
                        sleep(10)
110+
111+
112+
def _show_logs(resource_name, resource_type, sagemaker_session):
    """Print the CloudWatch logs of a SageMaker resource, for debugging.

    The log group is ``/aws/sagemaker/<resource_type>/<resource_name>``; all of
    its streams are listed with a start of '1d', in the region of the session's
    boto session. Any failure is logged with its stack trace and swallowed, so
    this never raises into the caller's cleanup path.
    """
    group_name = '/aws/sagemaker/{}/{}'.format(resource_type, resource_name)
    try:
        LOGGER.info('cloudwatch logs for log group {}:'.format(group_name))
        region = sagemaker_session.boto_session.region_name
        AWSLogs(
            log_group_name=group_name,
            log_stream_name='ALL',
            start='1d',
            aws_region=region
        ).list_logs()
    except Exception:
        LOGGER.exception(
            'Failure occurred while listing cloudwatch log group %s. Swallowing exception but printing '
            'stacktrace for debugging.',
            group_name
        )
123+
124+
125+
def _cleanup_logs(resource_name, resource_type, sagemaker_session):
    """Delete the CloudWatch log group of a SageMaker resource.

    The log group is ``/aws/sagemaker/<resource_type>/<resource_name>`` and is
    removed through a 'logs' client created from the session's boto session.
    Any failure is logged with its stack trace and swallowed, so this never
    raises into the caller's cleanup path.
    """
    group_name = '/aws/sagemaker/{}/{}'.format(resource_type, resource_name)
    try:
        LOGGER.info('deleting cloudwatch log group {}:'.format(group_name))
        logs_client = sagemaker_session.boto_session.client('logs')
        logs_client.delete_log_group(logGroupName=group_name)
        LOGGER.info('deleted cloudwatch log group: {}'.format(group_name))
    except Exception:
        LOGGER.exception(
            'Failure occurred while cleaning up cloudwatch log group %s. '
            'Swallowing exception but printing stacktrace for debugging.',
            group_name
        )
1.36 KB
Binary file not shown.

0 commit comments

Comments
 (0)