Skip to content

Commit c46c5bd

Browse files
authored
support nvidia-docker2 natively in local mode. (#426)
Local Mode with GPU relied on the environment having the nvidia runtime configured as the default. This change passes `runtime: nvidia` in the generated docker-compose.yaml, allowing Local Mode to work with a vanilla nvidia-docker2 installation.
1 parent eea56fc commit c46c5bd

File tree

4 files changed

+32
-4
lines changed

4 files changed

+32
-4
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ CHANGELOG
77

88
* feature: Local Mode: Add support for Batch Inference
99
* feature: Add timestamp to secondary status in training job output
10+
* enhancement: Local Mode: support nvidia-docker2 natively
11+
1012

1113
1.11.2
1214
======

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ def read(fname):
4444
],
4545

4646
# Declare minimal set for installation
47-
install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3>=1.2',
48-
'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5'],
47+
install_requires=['boto3>=1.4.8', 'numpy>=1.9.0', 'protobuf>=3.1', 'scipy>=0.19.0', 'urllib3 >=1.21, <1.23',
48+
'PyYAML>=3.2', 'protobuf3-to-dict>=0.1.5', 'docker-compose>=1.21.0'],
4949

5050
extras_require={
5151
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist',

src/sagemaker/local/image.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,8 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
362362
}
363363

364364
content = {
365-
# Some legacy hosts only support the 2.1 format.
366-
'version': '2.1',
365+
# Use version 2.3 as a minimum so that we can specify the runtime
366+
'version': '2.3',
367367
'services': services,
368368
'networks': {
369369
'sagemaker-local': {'name': 'sagemaker-local'}
@@ -415,6 +415,11 @@ def _create_docker_host(self, host, environment, optml_subdirs, command, volumes
415415
}
416416
}
417417

418+
# for GPU support pass in nvidia as the runtime, this is equivalent
419+
# to setting --runtime=nvidia in the docker commandline.
420+
if self.instance_type == 'local_gpu':
421+
host_config['runtime'] = 'nvidia'
422+
418423
if command == 'serve':
419424
serving_port = sagemaker.utils.get_config_value('local.serving_port',
420425
self.sagemaker_session.config) or 8080

tests/unit/test_image.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,27 @@ def test_train_local_code(download_folder, _cleanup, popen, _stream_output,
334334
assert '%s:/opt/ml/shared' % shared_folder_path in volumes
335335

336336

337+
def test_container_has_gpu_support(tmpdir, sagemaker_session):
338+
instance_count = 1
339+
image = 'my-image'
340+
sagemaker_container = _SageMakerContainer('local_gpu', instance_count, image,
341+
sagemaker_session=sagemaker_session)
342+
343+
docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
344+
assert 'runtime' in docker_host
345+
assert docker_host['runtime'] == 'nvidia'
346+
347+
348+
def test_container_does_not_enable_nvidia_docker_for_cpu_containers(tmpdir, sagemaker_session):
349+
instance_count = 1
350+
image = 'my-image'
351+
sagemaker_container = _SageMakerContainer('local', instance_count, image,
352+
sagemaker_session=sagemaker_session)
353+
354+
docker_host = sagemaker_container._create_docker_host('host-1', {}, set(), 'train', [])
355+
assert 'runtime' not in docker_host
356+
357+
337358
@patch('sagemaker.local.image._HostingContainer.run')
338359
@patch('shutil.copy')
339360
@patch('shutil.copytree')

0 commit comments

Comments
 (0)