Skip to content

Commit eb05645

Browse files
authored
Merge pull request #134 from aidan-plenert-macdonald/master
Make the MXNet Gluon MNIST Example scale
2 parents 53aec70 + 7330773 commit eb05645

File tree

2 files changed

+12
-4
lines changed

2 files changed

+12
-4
lines changed

sagemaker-python-sdk/mxnet_gluon_mnist/mnist.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616
# ------------------------------------------------------------ #
1717

1818

19-
def train(channel_input_dirs, hyperparameters, **kwargs):
19+
def train(channel_input_dirs, hyperparameters, hosts, num_gpus, **kwargs):
2020
# SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
2121
# the current container environment, but here we just use simple cpu context.
2222
ctx = mx.cpu()
@@ -41,8 +41,15 @@ def train(channel_input_dirs, hyperparameters, **kwargs):
4141
# Collect all parameters from net and its children, then initialize them.
4242
net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
4343
# Trainer is for updating parameters with gradient.
44+
45+
if len(hosts) == 1:
46+
kvstore = 'device' if num_gpus > 0 else 'local'
47+
else:
48+
kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync'
49+
4450
trainer = gluon.Trainer(net.collect_params(), 'sgd',
45-
{'learning_rate': learning_rate, 'momentum': momentum})
51+
{'learning_rate': learning_rate, 'momentum': momentum},
52+
kvstore=kvstore)
4653
metric = mx.metric.Accuracy()
4754
loss = gluon.loss.SoftmaxCrossEntropyLoss()
4855

sagemaker-python-sdk/mxnet_gluon_sentiment/sentiment.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ def train(current_host, hosts, num_cpus, num_gpus, channel_input_dirs, model_dir
3131
if len(hosts) == 1:
3232
kvstore = 'device' if num_gpus > 0 else 'local'
3333
else:
34-
kvstore = 'dist_sync'
34+
kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync'
3535

3636
ctx = mx.gpu() if num_gpus > 0 else mx.cpu()
3737

@@ -56,7 +56,8 @@ def train(current_host, hosts, num_cpus, num_gpus, channel_input_dirs, model_dir
5656
net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
5757
# Trainer is for updating parameters with gradient.
5858
trainer = gluon.Trainer(net.collect_params(), 'adam',
59-
{'learning_rate': learning_rate})
59+
{'learning_rate': learning_rate},
60+
kvstore=kvstore)
6061
metric = mx.metric.Accuracy()
6162
loss = gluon.loss.SoftmaxCrossEntropyLoss()
6263

0 commit comments

Comments (0)