Skip to content

Commit 370e96a

Browse files
authored
Merge branch 'master' into remove_cw_metrics_arg
2 parents 226720a + e3dc3b5 commit 370e96a

29 files changed

+128
-131
lines changed

CHANGELOG.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22
CHANGELOG
33
=========
44

5+
1.9.1dev
6+
========
7+
8+
* bug-fix: Estimators: Fix serialization of single records
9+
10+
1.9.0
11+
=====
12+
13+
* feature: Estimators: add support for MXNet 1.2.1
14+
515
1.8.0
616
=====
717

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ You can install from source by cloning this repository and issuing a pip install
5151

5252
git clone https://github.com/aws/sagemaker-python-sdk.git
5353
python setup.py sdist
54-
pip install dist/sagemaker-1.8.0.tar.gz
54+
pip install dist/sagemaker-1.9.0.tar.gz
5555

5656
Supported Operating Systems
5757
~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -240,7 +240,7 @@ MXNet SageMaker Estimators
240240

241241
With MXNet Estimators, you can train and host MXNet models on Amazon SageMaker.
242242

243-
Supported versions of MXNet: ``1.1.0``, ``1.0.0``, ``0.12.1``.
243+
Supported versions of MXNet: ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``.
244244

245245
More details at `MXNet SageMaker Estimators and Models`_.
246246

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def read(fname):
2323

2424

2525
setup(name="sagemaker",
26-
version="1.8.0",
26+
version="1.9.0",
2727
description="Open source library for training and deploying models on Amazon SageMaker.",
2828
packages=find_packages('src'),
2929
package_dir={'': 'src'},

src/sagemaker/amazon/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(self, content_type='application/x-recordio-protobuf'):
2929

3030
def __call__(self, array):
3131
if len(array.shape) == 1:
32-
array.reshape(1, array.shape[0])
32+
array = array.reshape(1, array.shape[0])
3333
assert len(array.shape) == 2, "Expecting a 1 or 2 dimensional array"
3434
buf = io.BytesIO()
3535
write_numpy_to_dense_tensor(buf, array)

src/sagemaker/chainer/README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ The following are optional arguments. When you create a ``Chainer`` object, you
158158
- ``train_volume_size`` Size in GB of the EBS volume to use for storing
159159
input data during training. Must be large enough to store training
160160
data if input_mode='File' is used (which is the default).
161-
- ``train_max_run`` Timeout in hours for training, after which Amazon
161+
- ``train_max_run`` Timeout in seconds for training, after which Amazon
162162
SageMaker terminates the job regardless of its current status.
163163
- ``input_mode`` The input mode that the algorithm supports. Valid
164164
modes: 'File' - Amazon SageMaker copies the training dataset from the

src/sagemaker/local/image.py

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import sys
2727
import tarfile
2828
import tempfile
29-
from fcntl import fcntl, F_GETFL, F_SETFL
3029
from six.moves.urllib.parse import urlparse
3130
from threading import Thread
3231

@@ -105,7 +104,7 @@ def train(self, input_data_config, hyperparameters):
105104
compose_command = self._compose()
106105

107106
_ecr_login_if_needed(self.sagemaker_session.boto_session, self.image)
108-
process = subprocess.Popen(compose_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
107+
process = subprocess.Popen(compose_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
109108

110109
try:
111110
_stream_output(process)
@@ -555,34 +554,20 @@ def __init__(self, host_dir, container_dir=None, channel=None):
555554
def _stream_output(process):
556555
"""Stream the output of a process to stdout
557556
558-
This function takes an existing process that will be polled for output. Both stdout and
559-
stderr will be polled and both will be sent to sys.stdout.
557+
This function takes an existing process that will be polled for output. Only stdout
558+
will be polled and sent to sys.stdout.
560559
561560
Args:
562561
process(subprocess.Popen): a process that has been started with
563-
stdout=PIPE and stderr=PIPE
562+
stdout=PIPE and stderr=STDOUT
564563
565564
Returns (int): process exit code
566565
"""
567566
exit_code = None
568567

569-
# Get the current flags for the stderr file descriptor
570-
# And add the NONBLOCK flag to allow us to read even if there is no data.
571-
# Since usually stderr will be empty unless there is an error.
572-
flags = fcntl(process.stderr, F_GETFL) # get current process.stderr flags
573-
fcntl(process.stderr, F_SETFL, flags | os.O_NONBLOCK)
574-
575568
while exit_code is None:
576569
stdout = process.stdout.readline().decode("utf-8")
577570
sys.stdout.write(stdout)
578-
try:
579-
stderr = process.stderr.readline().decode("utf-8")
580-
sys.stdout.write(stderr)
581-
except IOError:
582-
# If there is nothing to read on stderr we will get an IOError
583-
# this is fine.
584-
pass
585-
586571
exit_code = process.poll()
587572

588573
if exit_code != 0:

src/sagemaker/mxnet/README.rst

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ MXNet SageMaker Estimators and Models
55

66
With MXNet Estimators, you can train and host MXNet models on Amazon SageMaker.
77

8-
Supported versions of MXNet: ``1.1.0``, ``1.0.0``, ``0.12.1``.
8+
Supported versions of MXNet: ``1.2.1``, ``1.1.0``, ``1.0.0``, ``0.12.1``.
99

1010
Training with MXNet
1111
~~~~~~~~~~~~~~~~~~~
@@ -81,7 +81,7 @@ If you want to run your training script locally via the Python interpreter, look
8181
Using MXNet and numpy
8282
^^^^^^^^^^^^^^^^^^^^^
8383

84-
You can import both ``mxnet`` and ``numpy`` in your training script. When your script runs in SageMaker, it will run with access to MXNet version 1.0.0 and numpy version 1.13.3 by default. For more information on the environment your script runs in, please see `SageMaker MXNet Containers <#sagemaker-mxnet-containers>`__.
84+
You can import both ``mxnet`` and ``numpy`` in your training script. When your script runs in SageMaker, it will run with access to MXNet version 1.2.1 and numpy version 1.14.5 by default. For more information on the environment your script runs in, please see `SageMaker MXNet Containers <#sagemaker-mxnet-containers>`__.
8585

8686
Running an MXNet training script in SageMaker
8787
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -136,7 +136,7 @@ The following are optional arguments. When you create an ``MXNet`` object, you c
136136
- ``train_volume_size`` Size in GB of the EBS volume to use for storing
137137
input data during training. Must be large enough to store training
138138
data if input_mode='File' is used (which is the default).
139-
- ``train_max_run`` Timeout in hours for training, after which Amazon
139+
- ``train_max_run`` Timeout in seconds for training, after which Amazon
140140
SageMaker terminates the job regardless of its current status.
141141
- ``input_mode`` The input mode that the algorithm supports. Valid
142142
modes: 'File' - Amazon SageMaker copies the training dataset from the
@@ -581,23 +581,23 @@ When training and deploying training scripts, SageMaker runs your Python script
581581

582582
SageMaker runs MXNet Estimator scripts in either Python 2.7 or Python 3.5. You can select the Python version by passing a ``py_version`` keyword arg to the MXNet Estimator constructor. Setting this to ``py2`` (the default) will cause your training script to be run on Python 2.7. Setting this to ``py3`` will cause your training script to be run on Python 3.5. This Python version applies to both the Training Job, created by fit, and the Endpoint, created by deploy.
583583

584-
Your MXNet training script will be run on version 1.1.0 by default. (See below for how to choose a different version, and currently supported versions.) The decision to use the GPU or CPU version of MXNet is made by the ``train_instance_type``, set on the MXNet constructor. If you choose a GPU instance type, your training job will be run on a GPU version of MXNet. If you choose a CPU instance type, your training job will be run on a CPU version of MXNet. Similarly, when you call deploy, specifying a GPU or CPU deploy_instance_type, will control which MXNet build your Endpoint runs.
584+
Your MXNet training script will be run on version 1.2.1 by default. (See below for how to choose a different version, and currently supported versions.) The decision to use the GPU or CPU version of MXNet is made by the ``train_instance_type``, set on the MXNet constructor. If you choose a GPU instance type, your training job will be run on a GPU version of MXNet. If you choose a CPU instance type, your training job will be run on a CPU version of MXNet. Similarly, when you call deploy, specifying a GPU or CPU deploy_instance_type will control which MXNet build your Endpoint runs.
585585

586586
The Docker images have the following dependencies installed:
587587

588-
+-------------------------+--------------+-------------+-------------+
589-
| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 |
590-
+-------------------------+--------------+-------------+-------------+
591-
| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5|
592-
+-------------------------+--------------+-------------+-------------+
593-
| CUDA | 9.0 | 9.0 | 9.0 |
594-
+-------------------------+--------------+-------------+-------------+
595-
| numpy | 1.13.3 | 1.13.3 | 1.13.3 |
596-
+-------------------------+--------------+-------------+-------------+
588+
+-------------------------+--------------+-------------+-------------+-------------+
589+
| Dependencies | MXNet 0.12.1 | MXNet 1.0.0 | MXNet 1.1.0 | MXNet 1.2.1 |
590+
+-------------------------+--------------+-------------+-------------+-------------+
591+
| Python | 2.7 or 3.5 | 2.7 or 3.5| 2.7 or 3.5| 2.7 or 3.5|
592+
+-------------------------+--------------+-------------+-------------+-------------+
593+
| CUDA | 9.0 | 9.0 | 9.0 | 9.0 |
594+
+-------------------------+--------------+-------------+-------------+-------------+
595+
| numpy | 1.13.3 | 1.13.3 | 1.13.3 | 1.14.5 |
596+
+-------------------------+--------------+-------------+-------------+-------------+
597597

598598
The Docker images extend Ubuntu 16.04.
599599

600-
You can select version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to only specify major and minor version, e.g ``1.1``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.1.0.
600+
You can select the version of MXNet by passing a ``framework_version`` keyword arg to the MXNet Estimator constructor. Currently supported versions are listed in the above table. You can also set ``framework_version`` to specify only the major and minor version, e.g. ``1.2``, which will cause your training script to be run on the latest supported patch version of that minor version, which in this example would be 1.2.1.
601601
Alternatively, you can build your own image by following the instructions in the SageMaker MXNet containers repository, and passing ``image_name`` to the MXNet Estimator constructor.
602602

603603
You can visit the SageMaker MXNet containers repository here: https://github.com/aws/sagemaker-mxnet-containers/

src/sagemaker/mxnet/defaults.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
MXNET_VERSION = '1.1'
15+
MXNET_VERSION = '1.2'

src/sagemaker/pytorch/README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ The following are optional arguments. When you create a ``PyTorch`` object, you
187187
- ``train_volume_size`` Size in GB of the EBS volume to use for storing
188188
input data during training. Must be large enough to store training
189189
data if input_mode='File' is used (which is the default).
190-
- ``train_max_run`` Timeout in hours for training, after which Amazon
190+
- ``train_max_run`` Timeout in seconds for training, after which Amazon
191191
SageMaker terminates the job regardless of its current status.
192192
- ``input_mode`` The input mode that the algorithm supports. Valid
193193
modes: 'File' - Amazon SageMaker copies the training dataset from the

src/sagemaker/tensorflow/README.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ you can specify these as keyword arguments.
414414
- ``train_volume_size (int)`` Size in GB of the EBS volume to use for storing
415415
input data during training. Must be large enough to the store training
416416
data.
417-
- ``train_max_run (int)`` Timeout in hours for training, after which Amazon
417+
- ``train_max_run (int)`` Timeout in seconds for training, after which Amazon
418418
SageMaker terminates the job regardless of its current status.
419419
- ``output_path (str)`` S3 location where you want the training result (model
420420
artifacts and optional output files) saved. If not specified, results
@@ -826,6 +826,23 @@ If your TFRecords are compressed, you can train on Gzipped TF Records by passing
826826
You can learn more about ``PipeModeDataset`` in the sagemaker-tensorflow-extensions repository: https://github.com/aws/sagemaker-tensorflow-extensions
827827

828828

829+
Training with MKL-DNN disabled
830+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
831+
832+
SageMaker TensorFlow CPU images use TensorFlow built with Intel® MKL-DNN optimization.
833+
834+
In certain cases you might be able to get better performance by disabling this optimization
835+
(`for example when using small models <https://github.com/awslabs/amazon-sagemaker-examples/blob/d88d1c19861fb7733941969f5a68821d9da2982e/sagemaker-python-sdk/tensorflow_iris_dnn_classifier_using_estimators/iris_dnn_classifier.py#L7-L9>`_)
836+
837+
You can disable MKL-DNN optimization for TensorFlow ``1.8.0`` by setting the following two environment variables:
838+
839+
.. code:: python
840+
841+
import os
842+
843+
os.environ['TF_DISABLE_MKL'] = '1'
844+
os.environ['TF_DISABLE_POOL_ALLOCATOR'] = '1'
845+
829846
830847
SageMaker TensorFlow Docker containers
831848
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def tf_version(request):
7676
return request.param
7777

7878

79-
@pytest.fixture(scope='module', params=['0.12', '0.12.1', '1.0', '1.0.0', '1.1', '1.1.0'])
79+
@pytest.fixture(scope='module', params=['0.12', '0.12.1', '1.0', '1.0.0', '1.1', '1.1.0', '1.2', '1.2.1'])
8080
def mxnet_version(request):
8181
return request.param
8282

@@ -96,7 +96,7 @@ def tf_full_version(request):
9696
return request.param
9797

9898

99-
@pytest.fixture(scope='module', params=['0.12.1', '1.0.0', '1.1.0'])
99+
@pytest.fixture(scope='module', params=['0.12.1', '1.0.0', '1.1.0', '1.2.1'])
100100
def mxnet_full_version(request):
101101
return request.param
102102

tests/data/dummy_tensor

311 KB
Binary file not shown.

tests/integ/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import os
1717

1818
DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
19+
TRAINING_DEFAULT_TIMEOUT_MINUTES = 20
20+
TUNING_DEFAULT_TIMEOUT_MINUTES = 20
1921

2022
logging.getLogger('boto3').setLevel(logging.INFO)
2123
logging.getLogger('botocore').setLevel(logging.INFO)

tests/integ/test_byo_estimator.py

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,18 @@
1313
from __future__ import absolute_import
1414

1515
import gzip
16-
import io
1716
import json
1817
import os
1918
import pickle
2019
import sys
2120

22-
import boto3
23-
import numpy as np
2421
import pytest
2522

2623
import sagemaker
2724
from sagemaker.amazon.amazon_estimator import registry
28-
from sagemaker.amazon.common import write_numpy_to_dense_tensor
2925
from sagemaker.estimator import Estimator
3026
from sagemaker.utils import name_from_base
31-
from tests.integ import DATA_DIR
27+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
3228
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
3329

3430

@@ -57,27 +53,20 @@ def test_byo_estimator(sagemaker_session, region):
5753
5854
"""
5955
image_name = registry(region) + "/factorization-machines:1"
56+
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
6057

61-
with timeout(minutes=15):
58+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
6259
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
6360
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
6461

6562
with gzip.open(data_path, 'rb') as f:
6663
train_set, _, _ = pickle.load(f, **pickle_args)
6764

68-
# take 100 examples for faster execution
69-
vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
70-
labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')
71-
72-
buf = io.BytesIO()
73-
write_numpy_to_dense_tensor(buf, vectors, labels)
74-
buf.seek(0)
75-
76-
bucket = sagemaker_session.default_bucket()
7765
prefix = 'test_byo_estimator'
7866
key = 'recordio-pb-data'
79-
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
80-
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
67+
68+
s3_train_data = sagemaker_session.upload_data(path=training_data_path,
69+
key_prefix=os.path.join(prefix, 'train', key))
8170

8271
estimator = Estimator(image_name=image_name,
8372
role='SageMakerRole', train_instance_count=1,
@@ -111,6 +100,7 @@ def test_byo_estimator(sagemaker_session, region):
111100
def test_async_byo_estimator(sagemaker_session, region):
112101
image_name = registry(region) + "/factorization-machines:1"
113102
endpoint_name = name_from_base('byo')
103+
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
114104
training_job_name = ""
115105

116106
with timeout(minutes=5):
@@ -120,19 +110,11 @@ def test_async_byo_estimator(sagemaker_session, region):
120110
with gzip.open(data_path, 'rb') as f:
121111
train_set, _, _ = pickle.load(f, **pickle_args)
122112

123-
# take 100 examples for faster execution
124-
vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
125-
labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')
126-
127-
buf = io.BytesIO()
128-
write_numpy_to_dense_tensor(buf, vectors, labels)
129-
buf.seek(0)
130-
131-
bucket = sagemaker_session.default_bucket()
132113
prefix = 'test_byo_estimator'
133114
key = 'recordio-pb-data'
134-
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
135-
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
115+
116+
s3_train_data = sagemaker_session.upload_data(path=training_data_path,
117+
key_prefix=os.path.join(prefix, 'train', key))
136118

137119
estimator = Estimator(image_name=image_name,
138120
role='SageMakerRole', train_instance_count=1,

0 commit comments

Comments
 (0)