support nvidia-docker2 natively in local mode. (#426)

iquintero · web-flow · commit c46c5bd2c091 · 2018-10-12T17:05:17.000-07:00
Local Mode with GPU relied on the environment having
the nvidia runtime as default. This changes that so that
runtime: nvidia is passed in docker-compose.yaml allowing
local mode to work with a vanilla nvidia-docker2 installation.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,8 @@ CHANGELOG
 
 * feature: Local Mode: Add support for Batch Inference
 * feature: Add timestamp to secondary status in training job output
+* enhancement: Local Mode: support nvidia-docker2 natively
+
 
 1.11.2
 ======
diff --git a/setup.py b/setup.py
@@ -44,8 +44,8 @@ def read(fname):
       ],
 
       # Declare minimal set for installation
-      install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3>=1.2',
-                        'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5'],
+      install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3 >=1.21, <1.23',
+                        'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5', 'docker-compose>=1.21.0'],
 
       extras_require={
           'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',
diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py
@@ -362,8 +362,8 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
         }
 
         content = {
-            # Some legacy hosts only support the 2.1 format.
-            'version': '2.1',
+            # Use version 2.3 as a minimum so that we can specify the runtime
+            'version': '2.3',
             'services': services,
             'networks': {
                 'sagemaker-local': {'name': 'sagemaker-local'}
@@ -415,6 +415,11 @@ def _create_docker_host(self, host, environment, optml_subdirs, command, volumes
             }
         }
 
+        # For GPU support, pass in nvidia as the runtime; this is equivalent
+        # to setting --runtime=nvidia on the docker command line.
+        if self.instance_type == 'local_gpu':
+            host_config['runtime'] = 'nvidia'
+
         if command == 'serve':
             serving_port = sagemaker.utils.get_config_value('local.serving_port',
                                                             self.sagemaker_session.config) or 8080
diff --git a/tests/unit/test_image.py b/tests/unit/test_image.py
@@ -334,6 +334,27 @@ def test_train_local_code(download_folder, _cleanup, popen, _stream_output,
                 assert '%s:/opt/ml/shared' % shared_folder_path in volumes
 
 
+def test_container_has_gpu_support(tmpdir, sagemaker_session):
+    instance_count = 1
+    image = 'my-image'
+    sagemaker_container = _SageMakerContainer('local_gpu', instance_count, image,
+                                              sagemaker_session=sagemaker_session)
+
+    docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
+    assert 'runtime' in docker_host
+    assert docker_host['runtime'] == 'nvidia'
+
+
+def test_container_does_not_enable_nvidia_docker_for_cpu_containers(tmpdir, sagemaker_session):
+    instance_count = 1
+    image = 'my-image'
+    sagemaker_container = _SageMakerContainer('local', instance_count, image,
+                                              sagemaker_session=sagemaker_session)
+
+    docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
+    assert 'runtime' not in docker_host
+
+
 @patch('sagemaker.local.image._HostingContainer.run')
 @patch('shutil.copy')
 @patch('shutil.copytree')