Skip to content

Commit 311e164

Browse files
authored
Merge branch 'master' into tf-script-mode
2 parents aca3958 + 4153e41 commit 311e164

File tree

15 files changed

+754
-67
lines changed

15 files changed

+754
-67
lines changed

.pylintrc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,7 @@ dummy-variables-rgx=_|unused_
8484
# Apply logging string format checks to calls on these modules.
8585
logging-modules=
8686
logging
87+
88+
[TYPECHECK]
89+
ignored-modules=
90+
distutils

CHANGELOG.rst

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,13 @@
22
CHANGELOG
33
=========
44

5-
1.14.2-dev
6-
==========
5+
1.14.3-dev
6+
==========
7+
8+
* bug-fix: Local Mode: correctly handle the case where the model output folder doesn't exist yet
9+
10+
1.14.2
11+
======
712

813
* bug-fix: support ``CustomAttributes`` argument in local mode ``invoke_endpoint`` requests
914
* enhancement: add ``content_type`` parameter to ``sagemaker.tensorflow.serving.Predictor``
@@ -14,6 +19,10 @@ CHANGELOG
1419
* build: upgrade docker-compose to 1.23
1520
* enhancement: Frameworks: update warning for not setting framework_version as we aren't planning a breaking change anymore
1621
* feature: Estimator: add script mode and Python 3 support for TensorFlow
22+
* enhancement: Session: remove hardcoded 'training' from job status error message
23+
* bug-fix: Updated Cloudwatch namespace for metrics in TrainingJobsAnalytics
24+
* bug-fix: Changes to use correct s3 bucket and time range for dataframes in TrainingJobAnalytics.
25+
1726

1827
1.14.1
1928
======

src/sagemaker/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,4 @@
3636
from sagemaker.session import s3_input # noqa: F401
3737
from sagemaker.session import get_execution_role # noqa: F401
3838

39-
__version__ = '1.14.1'
39+
__version__ = '1.14.2'

src/sagemaker/analytics.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,12 @@ def _determine_timeinterval(self):
246246
"""
247247
description = self._sage_client.describe_training_job(TrainingJobName=self.name)
248248
start_time = description[u'TrainingStartTime'] # datetime object
249-
end_time = description.get(u'TrainingEndTime', datetime.datetime.utcnow())
249+
# Incrementing end time by 1 min since CloudWatch drops seconds before finding the logs.
250+
# This results in logs being searched in the time range in which the correct log line was not present.
251+
# Example - Log time - 2018-10-22 08:25:55
252+
# Here calculated end time would also be 2018-10-22 08:25:55 (without 1 min addition)
253+
# CW will consider end time as 2018-10-22 08:25 and will not be able to search the correct log.
254+
end_time = description.get(u'TrainingEndTime', datetime.datetime.utcnow()) + datetime.timedelta(minutes=1)
250255
return {
251256
'start_time': start_time,
252257
'end_time': end_time,

src/sagemaker/job.py

Lines changed: 32 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ def wait(self):
5151
pass
5252

5353
@staticmethod
54-
def _load_config(inputs, estimator):
55-
input_config = _Job._format_inputs_to_input_config(inputs)
56-
role = estimator.sagemaker_session.expand_role(estimator.role)
54+
def _load_config(inputs, estimator, expand_role=True, validate_uri=True):
55+
input_config = _Job._format_inputs_to_input_config(inputs, validate_uri)
56+
role = estimator.sagemaker_session.expand_role(estimator.role) if expand_role else estimator.role
5757
output_config = _Job._prepare_output_config(estimator.output_path, estimator.output_kms_key)
5858
resource_config = _Job._prepare_resource_config(estimator.train_instance_count,
5959
estimator.train_instance_type,
@@ -62,7 +62,8 @@ def _load_config(inputs, estimator):
6262
stop_condition = _Job._prepare_stop_condition(estimator.train_max_run)
6363
vpc_config = estimator.get_vpc_config()
6464

65-
model_channel = _Job._prepare_model_channel(input_config, estimator.model_uri, estimator.model_channel_name)
65+
model_channel = _Job._prepare_model_channel(input_config, estimator.model_uri, estimator.model_channel_name,
66+
validate_uri)
6667
if model_channel:
6768
input_config = [] if input_config is None else input_config
6869
input_config.append(model_channel)
@@ -75,7 +76,7 @@ def _load_config(inputs, estimator):
7576
'vpc_config': vpc_config}
7677

7778
@staticmethod
78-
def _format_inputs_to_input_config(inputs):
79+
def _format_inputs_to_input_config(inputs, validate_uri=True):
7980
if inputs is None:
8081
return None
8182

@@ -86,14 +87,14 @@ def _format_inputs_to_input_config(inputs):
8687

8788
input_dict = {}
8889
if isinstance(inputs, string_types):
89-
input_dict['training'] = _Job._format_string_uri_input(inputs)
90+
input_dict['training'] = _Job._format_string_uri_input(inputs, validate_uri)
9091
elif isinstance(inputs, s3_input):
9192
input_dict['training'] = inputs
9293
elif isinstance(inputs, file_input):
9394
input_dict['training'] = inputs
9495
elif isinstance(inputs, dict):
9596
for k, v in inputs.items():
96-
input_dict[k] = _Job._format_string_uri_input(v)
97+
input_dict[k] = _Job._format_string_uri_input(v, validate_uri)
9798
elif isinstance(inputs, list):
9899
input_dict = _Job._format_record_set_list_input(inputs)
99100
else:
@@ -111,15 +112,16 @@ def _convert_input_to_channel(channel_name, channel_s3_input):
111112
return channel_config
112113

113114
@staticmethod
114-
def _format_string_uri_input(uri_input):
115-
if isinstance(uri_input, str):
116-
if uri_input.startswith('s3://'):
117-
return s3_input(uri_input)
118-
elif uri_input.startswith('file://'):
119-
return file_input(uri_input)
120-
else:
121-
raise ValueError('Training input data must be a valid S3 or FILE URI: must start with "s3://" or '
122-
'"file://"')
115+
def _format_string_uri_input(uri_input, validate_uri=True):
116+
if isinstance(uri_input, str) and validate_uri and uri_input.startswith('s3://'):
117+
return s3_input(uri_input)
118+
elif isinstance(uri_input, str) and validate_uri and uri_input.startswith('file://'):
119+
return file_input(uri_input)
120+
elif isinstance(uri_input, str) and validate_uri:
121+
raise ValueError('Training input data must be a valid S3 or FILE URI: must start with "s3://" or '
122+
'"file://"')
123+
elif isinstance(uri_input, str):
124+
return s3_input(uri_input)
123125
elif isinstance(uri_input, s3_input):
124126
return uri_input
125127
elif isinstance(uri_input, file_input):
@@ -128,7 +130,7 @@ def _format_string_uri_input(uri_input):
128130
raise ValueError('Cannot format input {}. Expecting one of str, s3_input, or file_input'.format(uri_input))
129131

130132
@staticmethod
131-
def _prepare_model_channel(input_config, model_uri=None, model_channel_name=None):
133+
def _prepare_model_channel(input_config, model_uri=None, model_channel_name=None, validate_uri=True):
132134
if not model_uri:
133135
return
134136
elif not model_channel_name:
@@ -139,22 +141,24 @@ def _prepare_model_channel(input_config, model_uri=None, model_channel_name=None
139141
if channel['ChannelName'] == model_channel_name:
140142
raise ValueError('Duplicate channels not allowed.')
141143

142-
model_input = _Job._format_model_uri_input(model_uri)
144+
model_input = _Job._format_model_uri_input(model_uri, validate_uri)
143145
model_channel = _Job._convert_input_to_channel(model_channel_name, model_input)
144146

145147
return model_channel
146148

147149
@staticmethod
148-
def _format_model_uri_input(model_uri):
149-
if isinstance(model_uri, string_types):
150-
if model_uri.startswith('s3://'):
151-
return s3_input(model_uri, input_mode='File', distribution='FullyReplicated',
152-
content_type='application/x-sagemaker-model')
153-
elif model_uri.startswith('file://'):
154-
return file_input(model_uri)
155-
else:
156-
raise ValueError('Model URI must be a valid S3 or FILE URI: must start with "s3://" or '
157-
'"file://')
150+
def _format_model_uri_input(model_uri, validate_uri=True):
151+
if isinstance(model_uri, string_types)and validate_uri and model_uri.startswith('s3://'):
152+
return s3_input(model_uri, input_mode='File', distribution='FullyReplicated',
153+
content_type='application/x-sagemaker-model')
154+
elif isinstance(model_uri, string_types) and validate_uri and model_uri.startswith('file://'):
155+
return file_input(model_uri)
156+
elif isinstance(model_uri, string_types) and validate_uri:
157+
raise ValueError('Model URI must be a valid S3 or FILE URI: must start with "s3://" or '
158+
'"file://')
159+
elif isinstance(model_uri, string_types):
160+
return s3_input(model_uri, input_mode='File', distribution='FullyReplicated',
161+
content_type='application/x-sagemaker-model')
158162
else:
159163
raise ValueError('Cannot format model URI {}. Expecting str'.format(model_uri))
160164

src/sagemaker/local/utils.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616
import shutil
1717

18+
from distutils.dir_util import copy_tree
1819
from six.moves.urllib.parse import urlparse
1920

2021

@@ -45,6 +46,7 @@ def move_to_destination(source, destination, job_name, sagemaker_session):
4546
Args:
4647
source (str): root directory to move
4748
destination (str): file:// or s3:// URI that source will be moved to.
49+
job_name (str): SageMaker job name.
4850
sagemaker_session (sagemaker.Session): a sagemaker_session to interact with S3 if needed
4951
5052
Returns:
@@ -67,19 +69,12 @@ def move_to_destination(source, destination, job_name, sagemaker_session):
6769

6870

6971
def recursive_copy(source, destination):
70-
"""Similar to shutil.copy but the destination directory can exist. Existing files will be overriden.
72+
"""A wrapper around distutils.dir_util.copy_tree but won't throw any exception when the source
73+
directory does not exist.
74+
7175
Args:
7276
source (str): source path
7377
destination (str): destination path
7478
"""
75-
for root, dirs, files in os.walk(source):
76-
root = os.path.relpath(root, source)
77-
current_path = os.path.join(source, root)
78-
target_path = os.path.join(destination, root)
79-
80-
for file in files:
81-
shutil.copy(os.path.join(current_path, file), os.path.join(target_path, file))
82-
for d in dirs:
83-
new_dir = os.path.join(target_path, d)
84-
if not os.path.exists(new_dir):
85-
os.mkdir(os.path.join(target_path, d))
79+
if os.path.isdir(source):
80+
copy_tree(source, destination)

src/sagemaker/session.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,11 @@ def default_bucket(self):
169169
if self._default_bucket:
170170
return self._default_bucket
171171

172-
s3 = self.boto_session.resource('s3')
173172
account = self.boto_session.client('sts').get_caller_identity()['Account']
174173
region = self.boto_session.region_name
175174
default_bucket = 'sagemaker-{}-{}'.format(region, account)
176175

176+
s3 = self.boto_session.resource('s3')
177177
try:
178178
# 'us-east-1' cannot be specified because it is the default region:
179179
# https://github.com/boto/boto3/issues/125
@@ -636,7 +636,8 @@ def _check_job_status(self, job, desc, status_key_name):
636636

637637
if status != 'Completed' and status != 'Stopped':
638638
reason = desc.get('FailureReason', '(No reason provided)')
639-
raise ValueError('Error training {}: {} Reason: {}'.format(job, status, reason))
639+
job_type = status_key_name.replace('JobStatus', ' job')
640+
raise ValueError('Error for {} {}: {} Reason: {}'.format(job_type, job, status, reason))
640641

641642
def wait_for_endpoint(self, endpoint, poll=5):
642643
"""Wait for an Amazon SageMaker endpoint deployment to complete.

src/sagemaker/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
import six
2727

2828

29+
AIRFLOW_TIME_MACRO = "{{ execution_date.strftime('%Y-%m-%d-%H-%M-%S') }}"
30+
31+
2932
# Use the base name of the image as the job name if the user doesn't give us one
3033
def name_from_image(image):
3134
"""Create a training job name based on the image name and a timestamp.
@@ -58,6 +61,21 @@ def name_from_base(base, max_length=63, short=False):
5861
return '{}-{}'.format(trimmed_base, timestamp)
5962

6063

64+
def airflow_name_from_base(base):
65+
"""Append airflow execution_date macro (https://airflow.apache.org/code.html?#macros)
66+
to the provided string. The macro will be evaluated in Airflow operator runtime.
67+
This guarantees that different operators will have the same name returned by this function.
68+
69+
Args:
70+
base (str): String used as prefix to generate the unique name.
71+
72+
Returns:
73+
str: Input parameter with appended macro.
74+
"""
75+
76+
return "{}-{}".format(base, AIRFLOW_TIME_MACRO)
77+
78+
6179
def base_name_from_image(image):
6280
"""Extract the base name of the image to use as the 'algorithm name' for the job.
6381

src/sagemaker/workflow/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.

0 commit comments

Comments
 (0)