Skip to content

Commit 6e0047b

Browse files
authored
Dump + cleanup endpoint logs during integ tests (#102)
* Dump + cleanup endpoint logs during integ tests
* Fix flake errors
* Add awslogs to tox as a test dependency
* Specify region for AWSLogs
1 parent 6a74d07 commit 6e0047b

File tree

5 files changed

+34
-25
lines changed

5 files changed

+34
-25
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def read(fname):
3636

3737
extras_require={
3838
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',
39-
'mock', 'tensorflow>=1.3.0', 'contextlib2']},
39+
'mock', 'tensorflow>=1.3.0', 'contextlib2', 'awslogs']},
4040

4141
entry_points={
4242
'console_scripts': ['sagemaker=sagemaker.cli.main:main'],

tests/integ/test_tf.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from sagemaker import Session
1919
from sagemaker.tensorflow import TensorFlow
2020
from tests.integ import DATA_DIR, REGION
21-
from tests.integ.timeout import timeout_and_delete_endpoint, timeout
21+
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2222

2323
DATA_PATH = os.path.join(DATA_DIR, 'iris', 'data')
2424

@@ -43,12 +43,14 @@ def test_tf(sagemaker_session, tf_full_version):
4343
sagemaker_session=sagemaker_session,
4444
base_job_name='test-tf')
4545

46-
inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
46+
inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
4747
estimator.fit(inputs)
4848
print('job succeeded: {}'.format(estimator.latest_training_job.name))
4949

50-
with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
51-
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
50+
endpoint_name = estimator.latest_training_job.name
51+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
52+
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
53+
endpoint_name=endpoint_name)
5254

5355
features = [6.4, 3.2, 4.5, 1.5]
5456
dict_result = json_predictor.predict({'inputs': features})
@@ -81,9 +83,11 @@ def test_tf_async(sagemaker_session, tf_full_version):
8183
training_job_name = estimator.latest_training_job.name
8284
time.sleep(20)
8385

84-
with timeout_and_delete_endpoint(estimator=estimator, minutes=35):
86+
endpoint_name = training_job_name
87+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
8588
estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
86-
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
89+
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
90+
endpoint_name=endpoint_name)
8791

8892
result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
8993
print('predict result: {}'.format(result))

tests/integ/test_tf_cifar.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from sagemaker import Session
2121
from sagemaker.tensorflow import TensorFlow
2222
from tests.integ import DATA_DIR, REGION
23-
from tests.integ.timeout import timeout_and_delete_endpoint, timeout
23+
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2424

2525
PICKLE_CONTENT_TYPE = 'application/python-pickle'
2626

@@ -54,7 +54,8 @@ def test_cifar(sagemaker_session, tf_full_version):
5454
estimator.fit(inputs, logs=False)
5555
print('job succeeded: {}'.format(estimator.latest_training_job.name))
5656

57-
with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
57+
endpoint_name = estimator.latest_training_job.name
58+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
5859
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
5960
predictor.serializer = PickleSerializer()
6061
predictor.content_type = PICKLE_CONTENT_TYPE

tests/integ/timeout.py

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import signal
1414
from contextlib import contextmanager
1515
import logging
16-
16+
from awslogs.core import AWSLogs
1717
from botocore.exceptions import ClientError
1818

1919
LOGGER = logging.getLogger('timeout')
@@ -55,21 +55,6 @@ def handler(signum, frame):
5555
signal.alarm(0)
5656

5757

58-
@contextmanager
59-
def timeout_and_delete_endpoint(estimator, seconds=0, minutes=0, hours=0):
60-
with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
61-
try:
62-
yield [t]
63-
finally:
64-
try:
65-
estimator.delete_endpoint()
66-
LOGGER.info('deleted endpoint')
67-
except ClientError as ce:
68-
if ce.response['Error']['Code'] == 'ValidationException':
69-
# avoids the inner exception to be overwritten
70-
pass
71-
72-
7358
@contextmanager
7459
def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
7560
with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
@@ -79,7 +64,25 @@ def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, second
7964
try:
8065
sagemaker_session.delete_endpoint(endpoint_name)
8166
LOGGER.info('deleted endpoint {}'.format(endpoint_name))
67+
_cleanup_endpoint_logs(endpoint_name, sagemaker_session)
8268
except ClientError as ce:
8369
if ce.response['Error']['Code'] == 'ValidationException':
8470
# avoids the inner exception to be overwritten
8571
pass
72+
73+
74+
def _cleanup_endpoint_logs(endpoint_name, sagemaker_session):
75+
log_group = '/aws/sagemaker/Endpoints/{}'.format(endpoint_name)
76+
try:
77+
# print out logs before deletion for debuggability
78+
LOGGER.info('cloudwatch logs for log group {}:'.format(log_group))
79+
logs = AWSLogs(log_group_name=log_group, log_stream_name='ALL', start='1d',
80+
aws_region=sagemaker_session.boto_session.region_name)
81+
logs.list_logs()
82+
83+
cwl_client = sagemaker_session.boto_session.client('logs')
84+
cwl_client.delete_log_group(logGroupName=log_group)
85+
LOGGER.info('deleted cloudwatch log group: {}'.format(log_group))
86+
except Exception:
87+
LOGGER.exception('Failure occurred while cleaning up cloudwatch log group %s. ' +
88+
'Swallowing exception but printing stacktrace for debugging.', log_group)

tox.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ deps =
4545
mock
4646
contextlib2
4747
teamcity-messages
48+
awslogs
4849

4950
[testenv:flake8]
5051
basepython = python

0 commit comments

Comments
 (0)