Skip to content

Bug fix for getting dataframes in TrainingJobAnalytics. #441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Nov 14, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
249e0d6
Bug fix for getting dataframes in TrainingJobAnalytics.
Oct 24, 2018
8401afa
Addressing comments for Bug fix for getting dataframes in TrainingJob…
Oct 25, 2018
42d2741
Unit tests for Bug fix for getting dataframes in TrainingJobAnalytics.
Oct 25, 2018
1640264
Merge branch 'master' into TrainingJobAnalytics
laurenyu Oct 26, 2018
d43047d
updating change log for Bug fix for getting dataframes in TrainingJob…
Nov 4, 2018
c55e7ca
Add model parameters to Estimator, and bump library version to 1.13.0…
RodrigoAtAWS Nov 1, 2018
5bdb50a
local mode: improve training input/output (#449)
iquintero Nov 2, 2018
3d103b4
Add image URIs for built-in Algorithms for SIN/LHR/BOM/SFO/YUL (#456)
pdasamzn Nov 5, 2018
23851b1
Support MXNet 1.3 with its training script format changes (#446)
laurenyu Nov 5, 2018
bd1f43b
Make InputDataConfig optional for training. (#459)
nadiaya Nov 6, 2018
2edcc3a
add tfs container support (#460)
jesterhazy Nov 7, 2018
10b7e42
simplify create_image_uri function (#462)
jesterhazy Nov 7, 2018
e8bf717
bump version to 1.14.0 (#463)
jesterhazy Nov 7, 2018
eea2ad5
Skip gpu tests in regions without ml.p2.xlarge (#461)
yangaws Nov 7, 2018
c8147a9
Adding Object2Vec support to SageMaker Python SDK (#467)
pnpnpn Nov 8, 2018
52d1ec4
fix readme rendering (#464)
jesterhazy Nov 8, 2018
3a69cf6
Add Pylint (#465)
iquintero Nov 8, 2018
9a997a5
Support optional input channels in local mode. (#466)
nadiaya Nov 9, 2018
6a1d93c
build: upgrade docker-compose to 1.23 (#470)
iquintero Nov 9, 2018
876287e
add tensorflow serving docs (#468)
jesterhazy Nov 10, 2018
0c4ad9c
Very minor: Better documentation comment on DeferredError. (#469)
leopd Nov 11, 2018
1df9317
Update empty framework_version warning (#472)
laurenyu Nov 13, 2018
a9ed02e
Remove hardcoded 'training' in error message for checking job status …
laurenyu Nov 13, 2018
3fd5d9a
Bump version to 1.14.2 (#477)
iquintero Nov 13, 2018
ce1de18
Add missing changelog entry for 1.14.2 (#479)
iquintero Nov 14, 2018
aaf59b9
Bug fix for getting dataframes in TrainingJobAnalytics.
Oct 24, 2018
25038d1
Resolving conflicts in analytics
Nov 14, 2018
2f425ab
Merge branch 'master' into TrainingJobAnalytics
laurenyu Nov 14, 2018
93e3e48
Updating change log for version 1.14.2
Nov 14, 2018
c88611b
Merge branch 'master' into TrainingJobAnalytics
laurenyu Nov 14, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ CHANGELOG
* enhancement: Frameworks: update warning for not setting framework_version as we aren't planning a breaking change anymore
* enhancement: Session: remove hardcoded 'training' from job status error message
* bug-fix: Updated CloudWatch namespace for metrics in TrainingJobAnalytics
* bug-fix: Changes to use correct s3 bucket and time range for dataframes in TrainingJobAnalytics.


1.14.1
Expand Down
7 changes: 6 additions & 1 deletion src/sagemaker/analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,12 @@ def _determine_timeinterval(self):
"""
description = self._sage_client.describe_training_job(TrainingJobName=self.name)
start_time = description[u'TrainingStartTime'] # datetime object
end_time = description.get(u'TrainingEndTime', datetime.datetime.utcnow())
# Incrementing end time by 1 min since CloudWatch drops seconds before finding the logs.
# This results in logs being searched in the time range in which the correct log line was not present.
# Example - Log time - 2018-10-22 08:25:55
# Here calculated end time would also be 2018-10-22 08:25:55 (without 1 min addition)
# CW will consider end time as 2018-10-22 08:25 and will not be able to search the correct log.
end_time = description.get(u'TrainingEndTime', datetime.datetime.utcnow()) + datetime.timedelta(minutes=1)
return {
'start_time': start_time,
'end_time': end_time,
Expand Down
57 changes: 50 additions & 7 deletions tests/unit/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,49 @@ def create_sagemaker_session(describe_training_result=None, list_training_result
cwm_mock = Mock(name='cloudwatch_client')
boto_mock.client = Mock(return_value=cwm_mock)
cwm_mock.get_metric_statistics = Mock(
name='get_metric_statistics',
return_value=metric_stats_results,
name='get_metric_statistics'
)
cwm_mock.get_metric_statistics.side_effect = cw_request_side_effect
return sms


def cw_request_side_effect(Namespace, MetricName, Dimensions, StartTime, EndTime, Period, Statistics):
    """Side effect installed on the mocked ``get_metric_statistics`` call.

    Hands back the canned metric statistics only when the incoming request
    matches the expected one; any other request yields ``None``.
    """
    request_matches = _is_valid_request(
        Namespace, MetricName, Dimensions, StartTime, EndTime, Period, Statistics
    )
    if not request_matches:
        return None
    return _metric_stats_results()


def _is_valid_request(Namespace, MetricName, Dimensions, StartTime, EndTime, Period, Statistics):
    """Tell whether the given request arguments equal the expected request.

    The arguments are gathered into a dict, printed, and then compared for
    equality against the dict built by ``cw_request()``.
    """
    received_request = {
        'Namespace': Namespace,
        'MetricName': MetricName,
        'Dimensions': Dimensions,
        'StartTime': StartTime,
        'EndTime': EndTime,
        'Period': Period,
        'Statistics': Statistics,
    }
    # Printed before the comparison so the received request is visible in the output.
    print(received_request)
    expected_request = cw_request()
    return received_request == expected_request


def cw_request():
    """Build the request dict that ``_is_valid_request`` compares against.

    The time window starts at the fixture's training start time and ends one
    minute after its training end time.
    """
    job_description = _describe_training_result()
    window_start = job_description['TrainingStartTime']
    # The expected end of the window is the training end time plus one minute.
    window_end = job_description['TrainingEndTime'] + datetime.timedelta(minutes=1)
    job_dimension = {
        'Name': 'TrainingJobName',
        'Value': 'my-training-job'
    }
    return {
        'Namespace': '/aws/sagemaker/TrainingJobs',
        'MetricName': 'train:acc',
        'Dimensions': [job_dimension],
        'StartTime': window_start,
        'EndTime': window_end,
        'Period': 60,
        'Statistics': ['Average'],
    }


def test_abstract_base_class():
# confirm that the abstract base class can't be instantiated directly
with pytest.raises(TypeError) as _: # noqa: F841
Expand Down Expand Up @@ -165,12 +202,15 @@ def test_trainer_name():
assert str(trainer).find("my-training-job") != -1


def test_trainer_dataframe():
describe_training_result = {
def _describe_training_result():
return {
'TrainingStartTime': datetime.datetime(2018, 5, 16, 1, 2, 3),
'TrainingEndTime': datetime.datetime(2018, 5, 16, 5, 6, 7),
}
metric_stats_results = {


def _metric_stats_results():
return {
'Datapoints': [
{
'Average': 77.1,
Expand All @@ -186,8 +226,11 @@ def test_trainer_dataframe():
},
]
}
session = create_sagemaker_session(describe_training_result=describe_training_result,
metric_stats_results=metric_stats_results)


def test_trainer_dataframe():
session = create_sagemaker_session(describe_training_result=_describe_training_result(),
metric_stats_results=_metric_stats_results())
trainer = TrainingJobAnalytics("my-training-job", ["train:acc"], sagemaker_session=session)

df = trainer.dataframe()
Expand Down