Create parameter server in different thread (#129)

icywang86rui · web-flow · commit 962f15bd1dda · 2018-11-26T21:43:27.000-08:00
* Create parameter server in different thread
* Fix some integration tests
diff --git a/setup.py b/setup.py
@@ -53,6 +53,6 @@ def read(fname):
                       'pandas', 'Pillow', 'h5py'],
     extras_require={
         'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
-                 'sagemaker', 'tensorflow', 'docker-compose']
+                 'sagemaker>=1.15.2', 'tensorflow', 'docker-compose']
     },
 )
diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py
@@ -17,16 +17,16 @@
 import logging
 import os
 import subprocess
+import threading
 import time
 
 import sagemaker_containers.beta.framework as framework
+import tensorflow as tf
 
-import sagemaker_tensorflow_container.s3_utils as s3_utils
-
+from sagemaker_tensorflow_container import s3_utils
 
 logger = logging.getLogger(__name__)
 
-
 SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled'
 
 
@@ -88,30 +88,21 @@ def host_addresses(hosts, port=2222):
     return tf_config
 
 
-def _env_vars_with_tf_config(env, ps_task):
+def _run_ps(env, cluster):
+    logger.info('Running distributed training job with parameter servers')
+
+    cluster_spec = tf.train.ClusterSpec(cluster)
+    task_index = env.hosts.index(env.current_host)
+
+    server = tf.train.Server(cluster_spec, job_name='ps', task_index=task_index)
+
+    threading.Thread(target=lambda: server.join()).start()
+
+
+def _run_worker(env, tf_config):
     env_vars = env.to_env_vars()
-    env_vars['TF_CONFIG'] = json.dumps(_build_tf_config(
-        hosts=env.hosts,
-        current_host=env.current_host,
-        ps_task=ps_task))
-    return env_vars
-
-
-def _run_ps(env):
-    env_vars = _env_vars_with_tf_config(env, ps_task=True)
-    # Parameter server processes should always run on CPU. Sets CUDA_VISIBLE_DEVICES to '-1' forces
-    # TensorFlow to use CPU.
-    env_vars['CUDA_VISIBLE_DEVICES'] = json.dumps(-1)
-    framework.entry_point.run(env.module_dir, env.user_entry_point,
-                              env.to_cmd_args(), env_vars, wait=False)
-
-
-def _run_worker(env):
-    # when _run_ps is called CUDA_VISIBLE_DEVICES is set with os.environ.
-    # We need to unset it so the worker process can use the GPUs.
-    if os.environ.get('CUDA_VISIBLE_DEVICES'):
-        del os.environ['CUDA_VISIBLE_DEVICES']
-    env_vars = _env_vars_with_tf_config(env, ps_task=False)
+    env_vars['TF_CONFIG'] = json.dumps(tf_config)
+
     framework.entry_point.run(env.module_dir, env.user_entry_point, env.to_cmd_args(), env_vars)
 
 
@@ -137,11 +128,13 @@ def train(env):
         SAGEMAKER_PARAMETER_SERVER_ENABLED, False)
     if len(env.hosts) > 1 and parameter_server_enabled:
 
+        tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host)
+
         logger.info('Running distributed training job with parameter servers')
         logger.info('Launching parameter server process')
-        _run_ps(env)
+        _run_ps(env, tf_config['cluster'])
         logger.info('Launching worker process')
-        _run_worker(env)
+        _run_worker(env, tf_config)
 
         if not _is_host_master(env.hosts, env.current_host):
             _wait_until_master_is_down(env.hosts[0])
diff --git a/test/integration/local/test_keras.py b/test/integration/local/test_keras.py
@@ -33,6 +33,7 @@ def test_keras_training(sagemaker_local_session, docker_image, tmpdir):
         role='SageMakerRole',
         train_instance_count=1,
         train_instance_type='local',
+        image_name=docker_image,
         sagemaker_session=sagemaker_local_session,
         model_dir='/opt/ml/model',
         output_path=output_path,
@@ -41,7 +42,8 @@ def test_keras_training(sagemaker_local_session, docker_image, tmpdir):
 
     estimator.fit()
 
-    model = serving.Model(model_data=output_path, role='SageMakerRole',
+    model = serving.Model(model_data=output_path,
+                          role='SageMakerRole',
                           framework_version='1.11.0',
                           sagemaker_session=sagemaker_local_session)
 
diff --git a/test/integration/local/test_training.py b/test/integration/local/test_training.py
@@ -16,12 +16,10 @@
 import tarfile
 
 import pytest
-from sagemaker.estimator import Framework
 from sagemaker.tensorflow import TensorFlow
 
 from test.integration.docker_utils import Container
 
-
 RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
 TF_CHECKPOINT_FILES = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta']
 
@@ -94,34 +92,27 @@ def test_distributed_training_cpu_ps(sagemaker_local_session, docker_image, tmpd
     _assert_files_exist_in_tar(output_path, TF_CHECKPOINT_FILES)
 
 
-class ScriptModeTensorFlow(Framework):
-    """This class is temporary until the final version of Script Mode is released.
-    """
-
-    __framework_name__ = "tensorflow-scriptmode-beta"
-
-    create_model = TensorFlow.create_model
-
-    def __init__(self, py_version='py', **kwargs):
-        self.requirements_file = None
-        self.py_version = py_version
-        self.framework_version = 'some version'
-        super(ScriptModeTensorFlow, self).__init__(**kwargs)
-
-
-def run_tf_training(script, instance_type, instance_count,
+def run_tf_training(script,
+                    instance_type,
+                    instance_count,
                     sagemaker_local_session,
                     docker_image, training_data_path, output_path=None,
-                    hyperparameters={}):
-    estimator = ScriptModeTensorFlow(entry_point=script,
-                                     role='SageMakerRole',
-                                     train_instance_count=instance_count,
-                                     train_instance_type=instance_type,
-                                     sagemaker_session=sagemaker_local_session,
-                                     image_name=docker_image,
-                                     output_path=output_path,
-                                     hyperparameters=hyperparameters,
-                                     base_job_name='test-tf')
+                    hyperparameters=None):
+
+    hyperparameters = hyperparameters or {}
+
+    estimator = TensorFlow(entry_point=script,
+                           role='SageMakerRole',
+                           train_instance_count=instance_count,
+                           train_instance_type=instance_type,
+                           sagemaker_session=sagemaker_local_session,
+                           image_name=docker_image,
+                           model_dir='/opt/ml/model',
+                           output_path=output_path,
+                           hyperparameters=hyperparameters,
+                           base_job_name='test-tf',
+                           framework_version='1.11.0',
+                           py_version='py3')
 
     estimator.fit(training_data_path)
 
diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py
@@ -26,12 +26,12 @@ def test_mnist(sagemaker_session, ecr_image, instance_type):
     script = os.path.join(resource_path, 'mnist', 'mnist.py')
     estimator = TensorFlow(entry_point=script,
                            role='SageMakerRole',
-                           training_steps=1,
-                           evaluation_steps=1,
-                           train_instance_count=1,
                            train_instance_type=instance_type,
+                           train_instance_count=1,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image,
+                           framework_version='1.11.0',
+                           py_version='py3',
                            base_job_name='test-sagemaker-mnist')
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(resource_path, 'mnist', 'data'),
@@ -46,44 +46,41 @@ def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type):
     script = os.path.join(resource_path, 'mnist', 'distributed_mnist.py')
     estimator = TensorFlow(entry_point=script,
                            role='SageMakerRole',
-                           training_steps=1,
-                           evaluation_steps=1,
                            train_instance_count=2,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image,
+                           framework_version='1.11.0',
+                           py_version='py3',
                            base_job_name='test-tf-sm-distributed-mnist')
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(resource_path, 'mnist', 'data-distributed'),
         key_prefix='scriptmode/mnist-distributed')
     estimator.fit(inputs)
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'graph.pbtxt'))
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'model.ckpt-0.index'))
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'model.ckpt-0.meta'))
+    model_s3_url = estimator.create_model().model_data
+    _assert_s3_file_exists(model_s3_url)
 
 
 def test_distributed_mnist_ps(sagemaker_session, ecr_image, instance_type):
     resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
     script = os.path.join(resource_path, 'mnist', 'distributed_mnist.py')
     estimator = TensorFlow(entry_point=script,
                            role='SageMakerRole',
-                           # training_steps and evaluation_steps are legacy parameters from
-                           # framework mode. These number are not used in the training job.
-                           training_steps=1,
-                           evaluation_steps=1,
                            hyperparameters={SAGEMAKER_PARAMETER_SERVER_ENABLED: True},
                            train_instance_count=2,
                            train_instance_type=instance_type,
                            sagemaker_session=sagemaker_session,
                            image_name=ecr_image,
+                           framework_version='1.11.0',
+                           py_version='py3',
                            base_job_name='test-tf-sm-distributed-mnist')
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(resource_path, 'mnist', 'data-distributed'),
         key_prefix='scriptmode/mnist-distributed')
     estimator.fit(inputs)
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'graph.pbtxt'))
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'model.ckpt-0.index'))
-    _assert_s3_file_exists(os.path.join(estimator.checkpoint_path, 'model.ckpt-0.meta'))
+    _assert_s3_file_exists(os.path.join(estimator.model_dir, 'graph.pbtxt'))
+    _assert_s3_file_exists(os.path.join(estimator.model_dir, 'model.ckpt-0.index'))
+    _assert_s3_file_exists(os.path.join(estimator.model_dir, 'model.ckpt-0.meta'))
 
 
 def _assert_s3_file_exists(s3_url):
diff --git a/test/resources/mnist/distributed_mnist.py b/test/resources/mnist/distributed_mnist.py
@@ -11,6 +11,7 @@
 from tensorflow.python.platform import tf_logging
 import logging as _logging
 import sys as _sys
+import json
 
 
 def cnn_model_fn(features, labels, mode):
@@ -122,9 +123,8 @@ def _parse_args():
     parser.add_argument('--epochs', type=int, default=1)
     # Data, model, and output directories
     parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
-    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
+    parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR'])
     parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
-    parser.add_argument('--checkpoint_path', type=str, default=os.environ['SM_MODEL_DIR'])
 
     return parser.parse_known_args()
 
@@ -140,8 +140,12 @@ def _parse_args():
     eval_data, eval_labels = _load_testing_data(args.train)
 
     # Create the Estimator
+    if json.loads(os.environ['SM_TRAINING_ENV'])['additional_framework_parameters'].get('sagemaker_parameter_server_enabled'):
+        model_dir = args.model_dir
+    else:
+        model_dir = os.environ['SM_MODEL_DIR']
     mnist_classifier = tf.estimator.Estimator(
-        model_fn=cnn_model_fn, model_dir=args.checkpoint_path)
+        model_fn=cnn_model_fn, model_dir=model_dir)
 
     # Set up logging for predictions
     # Log the values in the "Softmax" tensor with label "probabilities"
diff --git a/test/unit/test_training.py b/test/unit/test_training.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -53,6 +53,6 @@ def read(fname):` |
| `53` | `53` | `'pandas', 'Pillow', 'h5py'],` |
| `54` | `54` | `extras_require={` |
| `55` | `55` | `'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',` |
| `56` |  | `- 'sagemaker', 'tensorflow', 'docker-compose']` |
|  | `56` | `+ 'sagemaker>=1.15.2', 'tensorflow', 'docker-compose']` |
| `57` | `57` | `},` |
| `58` | `58` | `)` |