Fix broken test test_distributed_mnist_no_ps (#156)

icywang86rui · web-flow · commit ec07c35b3f7f · 2019-01-28T13:19:39.000-08:00
This test shouldn't save checkpoints since the two hosts are just running
training jobs independently, and the checkpoints interfere with each other.
This change switches the test to use the Keras mnist script instead.

This change also sets the saved model path to /opt/ml/model so we can just use
the estimator.model_data path to assert the model exists.
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
@@ -37,13 +37,12 @@ def test_mnist(sagemaker_session, ecr_image, instance_type):
         path=os.path.join(resource_path, 'mnist', 'data'),
         key_prefix='scriptmode/mnist')
     estimator.fit(inputs)
-    model_s3_url = estimator.create_model().model_data
-    _assert_s3_file_exists(model_s3_url)
+    _assert_s3_file_exists(estimator.model_data)
 
 
 def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type):
     resource_path = os.path.join(os.path.dirname(__file__), '../..', 'resources')
-    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
+    script = os.path.join(resource_path, 'mnist', 'mnist.py')
     estimator = TensorFlow(entry_point=script,
                            role='SageMakerRole',
                            train_instance_count=2,
@@ -54,10 +53,9 @@ def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type):
                            py_version='py3',
                            base_job_name='test-tf-sm-distributed-mnist')
     inputs = estimator.sagemaker_session.upload_data(
-        path=os.path.join(resource_path, 'mnist', 'data-distributed'),
-        key_prefix='scriptmode/mnist-distributed')
+        path=os.path.join(resource_path, 'mnist', 'data'),
+        key_prefix='scriptmode/mnist')
     estimator.fit(inputs)
-    _assert_checkpoint_exists(estimator.model_dir, 0)
     _assert_s3_file_exists(estimator.model_data)
 
 
diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py
@@ -2,6 +2,7 @@
 import argparse
 import os
 import numpy as np
+import json
 
 
 def _parse_args():
@@ -11,10 +12,11 @@ def _parse_args():
     # hyperparameters sent by the client are passed as command-line arguments to the script.
     parser.add_argument('--epochs', type=int, default=1)
     # Data, model, and output directories
-    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
     parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
     parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
-    
+    parser.add_argument('--hosts', type=list, default=json.loads(os.environ['SM_HOSTS']))
+    parser.add_argument('--current-host', type=str, default=os.environ['SM_CURRENT_HOST'])
+
     return parser.parse_known_args()
 
 
@@ -46,4 +48,5 @@ def _load_testing_data(base_dir):
 x_test, y_test = _load_testing_data(args.train)
 model.fit(x_train, y_train, epochs=args.epochs)
 model.evaluate(x_test, y_test)
-model.save(os.path.join(args.model_dir, 'my_model.h5'))
+if args.current_host == args.hosts[0]:
+    model.save(os.path.join('/opt/ml/model', 'my_model.h5'))