Skip to content

Commit 070e5fb

Browse files
authored
Add Horovod tests (#151)
1 parent 8be0efe commit 070e5fb

File tree

5 files changed, with 14 additions and 12 deletions.

docker/1.12.0/Dockerfile.cpu

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
1515
# Install Open MPI
1616
RUN mkdir /tmp/openmpi && \
1717
cd /tmp/openmpi && \
18-
curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
19-
tar zxf openmpi-3.0.0.tar.gz && \
20-
cd openmpi-3.0.0 && \
18+
curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \
19+
tar zxf openmpi-3.1.2.tar.gz && \
20+
cd openmpi-3.1.2 && \
2121
./configure --enable-orterun-prefix-by-default && \
2222
make -j $(nproc) all && \
2323
make install && \
@@ -76,7 +76,7 @@ COPY $framework_support_installable .
7676

7777
RUN pip install --no-cache-dir -U \
7878
keras==2.2.4 \
79-
sagemaker-containers==2.4.0 \
79+
sagemaker-containers==2.4.2 \
8080
$framework_support_installable \
8181
"sagemaker-tensorflow>=1.12,<1.13" && \
8282
# Let's install TensorFlow separately in the end to avoid

docker/1.12.0/Dockerfile.gpu

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
2626
libnccl-dev=${NCCL_VERSION} \
2727
libgomp1 \
2828
wget \
29+
openssh-client \
2930
openssh-server \
3031
build-essential && \
3132
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
@@ -50,9 +51,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
5051
# Install Open MPI
5152
RUN mkdir /tmp/openmpi && \
5253
cd /tmp/openmpi && \
53-
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
54-
tar zxf openmpi-3.0.0.tar.gz && \
55-
cd openmpi-3.0.0 && \
54+
curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \
55+
tar zxf openmpi-3.1.2.tar.gz && \
56+
cd openmpi-3.1.2 && \
5657
./configure --enable-orterun-prefix-by-default && \
5758
make -j $(nproc) all && \
5859
make install && \
@@ -111,7 +112,7 @@ COPY $framework_support_installable .
111112

112113
RUN pip install --no-cache-dir -U \
113114
keras==2.2.4 \
114-
sagemaker-containers==2.4.0 \
115+
sagemaker-containers==2.4.2 \
115116
$framework_support_installable \
116117
"sagemaker-tensorflow>=1.12,<1.13" \
117118
# Let's install TensorFlow separately in the end to avoid

test/integration/local/test_horovod.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
2323

2424

25-
@pytest.mark.skip(reason="Horovod feature is not officially launched")
2625
@pytest.mark.parametrize('instances, processes', [
2726
[1, 2],
2827
(2, 1),

test/integration/sagemaker/test_horovod.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,19 +14,19 @@
1414

1515
import os
1616

17-
import pytest
1817
import sagemaker
1918
from sagemaker.tensorflow import TensorFlow
2019

2120
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
2221

2322

24-
@pytest.mark.skip(reason="Horovod feature is not officially launched")
2523
def test_distributed_training_horovod(sagemaker_session,
2624
sagemaker_local_session,
2725
instance_type,
2826
ecr_image,
2927
tmpdir):
28+
29+
mpi_options = '-verbose -x orte_base_help_aggregate=0'
3030
estimator = TensorFlow(
3131
entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
3232
role='SageMakerRole',
@@ -37,7 +37,7 @@ def test_distributed_training_horovod(sagemaker_session,
3737
py_version='py3',
3838
script_mode=True,
3939
hyperparameters={'sagemaker_mpi_enabled': True,
40-
'sagemaker_mpi_custom_mpi_options': '-verbose',
40+
'sagemaker_mpi_custom_mpi_options': mpi_options,
4141
'sagemaker_mpi_num_of_processes_per_host': 1})
4242

4343
estimator.fit()

test/resources/mnist/horovod_mnist.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from __future__ import absolute_import, print_function
1414

1515
import os
16+
import subprocess
1617

1718
import keras
1819
from keras.datasets import mnist
@@ -23,6 +24,7 @@
2324
import tensorflow as tf
2425
import horovod.keras as hvd
2526

27+
2628
# Horovod: initialize Horovod.
2729
hvd.init()
2830

0 commit comments

Comments
 (0)