@@ -1,5 +1,3 @@
-# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
-#
 # Licensed under the Apache License, Version 2.0 (the "License"). You
 # may not use this file except in compliance with the License. A copy of
 # the License is located at
@@ -12,13 +10,17 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
+import argparse
+import gzip
+import json
 import logging
+import os
+import struct
 
-import gzip
 import mxnet as mx
 import numpy as np
-import os
-import struct
+
+from sagemaker_mxnet_container.training_utils import scheduler_host
 
 
 def load_data(path):
@@ -56,23 +58,70 @@ def get_train_context(num_gpus):
     return mx.cpu()
 
 
-def train(channel_input_dirs, hyperparameters, hosts, num_gpus, **kwargs):
-    (train_labels, train_images) = load_data(os.path.join(channel_input_dirs['train']))
-    (test_labels, test_images) = load_data(os.path.join(channel_input_dirs['test']))
-    batch_size = 100
-    train_iter = mx.io.NDArrayIter(train_images, train_labels, batch_size, shuffle=True)
+def train(batch_size, epochs, learning_rate, num_gpus, training_channel, testing_channel,
+          hosts, current_host, model_dir):
+    (train_labels, train_images) = load_data(training_channel)
+    (test_labels, test_images) = load_data(testing_channel)
+
+    # Data parallel training - shard the data so each host
+    # only trains on a subset of the total data.
+    shard_size = len(train_images) // len(hosts)
+    for i, host in enumerate(hosts):
+        if host == current_host:
+            start = shard_size * i
+            end = start + shard_size
+            break
+
+    train_iter = mx.io.NDArrayIter(train_images[start:end], train_labels[start:end], batch_size,
+                                   shuffle=True)
     val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)
+
     logging.getLogger().setLevel(logging.DEBUG)
+
     kvstore = 'local' if len(hosts) == 1 else 'dist_sync'
-    mlp_model = mx.mod.Module(
-        symbol=build_graph(),
-        context=get_train_context(num_gpus))
+
+    mlp_model = mx.mod.Module(symbol=build_graph(),
+                              context=get_train_context(num_gpus))
     mlp_model.fit(train_iter,
                   eval_data=val_iter,
                   kvstore=kvstore,
                   optimizer='sgd',
-                  optimizer_params={'learning_rate': float(hyperparameters.get("learning_rate", 0.1))},
+                  optimizer_params={'learning_rate': learning_rate},
                   eval_metric='acc',
                   batch_end_callback=mx.callback.Speedometer(batch_size, 100),
-                  num_epoch=1)
-    return mlp_model
+                  num_epoch=epochs)
+
+    if len(hosts) == 1 or current_host == scheduler_host(hosts):
+        save(model_dir, mlp_model)
+
+
+def save(model_dir, model):
+    model.symbol.save(os.path.join(model_dir, 'model-symbol.json'))
+    model.save_params(os.path.join(model_dir, 'model-0000.params'))
+
+    signature = [{'name': data_desc.name, 'shape': [dim for dim in data_desc.shape]}
+                 for data_desc in model.data_shapes]
+    with open(os.path.join(model_dir, 'model-shapes.json'), 'w') as f:
+        json.dump(signature, f)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--batch-size', type=int, default=100)
+    parser.add_argument('--epochs', type=int, default=10)
+    parser.add_argument('--learning-rate', type=float, default=0.1)
+
+    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
+    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
+    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
+
+    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
+    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
+
+    args = parser.parse_args()
+
+    num_gpus = int(os.environ['SM_NUM_GPUS'])
+
+    train(args.batch_size, args.epochs, args.learning_rate, num_gpus, args.train, args.test,
+          args.hosts, args.current_host, args.model_dir)
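To make the sharding arithmetic in train() concrete, here is a hedged trace, assuming a two-host job and the standard 60,000-image MNIST training set (both assumptions, not stated in this diff); SageMaker names hosts algo-1, algo-2, and so on:

# Hypothetical values: a two-host cluster over the full MNIST training set.
hosts = ['algo-1', 'algo-2']           # what json.loads(os.environ['SM_HOSTS']) might yield
current_host = 'algo-2'
train_images = [None] * 60000          # stand-in for the loaded image array

shard_size = len(train_images) // len(hosts)   # 30000
for i, host in enumerate(hosts):
    if host == current_host:
        start = shard_size * i                 # 30000
        end = start + shard_size               # 60000
        break

print(start, end)  # -> 30000 60000: this host trains on the second half of the data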
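The save() helper writes MXNet's standard checkpoint names, 'model-symbol.json' and 'model-0000.params', plus a 'model-shapes.json' input signature, so the artifacts can later be rebuilt with mx.mod.Module.load. A minimal loading sketch under that assumption (load_model here is an illustrative helper, not part of this change):

import json
import os

import mxnet as mx


def load_model(model_dir):
    # Module.load expects the checkpoint prefix and epoch behind
    # '<prefix>-symbol.json' and '<prefix>-0000.params', i.e. 'model' and 0 here.
    module = mx.mod.Module.load(os.path.join(model_dir, 'model'), 0)

    # model-shapes.json was written by save() above; reuse it to bind the
    # module for inference with the same input signature it was trained with.
    with open(os.path.join(model_dir, 'model-shapes.json')) as f:
        signature = json.load(f)
    data_shapes = [(s['name'], tuple(s['shape'])) for s in signature]

    module.bind(for_training=False, data_shapes=data_shapes)
    return module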
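Finally, the __main__ block reads hyperparameters from the command line and data locations from the SM_* environment variables that the SageMaker container sets; hyperparameters passed to the estimator are forwarded to the script as --key value arguments. For context, a hedged sketch of launching this entry point with the SageMaker Python SDK's MXNet estimator (the script name, role ARN, S3 paths, and framework version below are placeholders, not part of this commit):

from sagemaker.mxnet import MXNet

# Placeholder values throughout -- substitute a real role ARN and S3 paths.
estimator = MXNet(entry_point='mnist.py',        # assumed name of the script in this diff
                  role='arn:aws:iam::123456789012:role/SageMakerRole',
                  train_instance_count=2,        # more than one host, so train() picks 'dist_sync'
                  train_instance_type='ml.m4.xlarge',
                  framework_version='1.3.0',     # script-mode container version, assumed
                  hyperparameters={'batch-size': 100,      # becomes --batch-size
                                   'epochs': 10,           # becomes --epochs
                                   'learning-rate': 0.1})  # becomes --learning-rate

# Channel names map to SM_CHANNEL_TRAIN / SM_CHANNEL_TEST inside the container.
estimator.fit({'train': 's3://my-bucket/mnist/train',
               'test': 's3://my-bucket/mnist/test'})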