Serve using MMS and inference toolkit #4

Merged · 3 commits · Sep 9, 2019
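In brief, from the diff below: the build now packages the serving toolkit as a wheel, restructures docker/ into per-Python-version directories for PyTorch 1.2.0, and serves models with mxnet-model-server (MMS) driven by a SageMaker inference toolkit entrypoint.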
Changes from all commits
63 changes: 26 additions & 37 deletions buildspec.yml
@@ -2,10 +2,12 @@ version: 0.2
 
 env:
   variables:
-    FRAMEWORK_VERSION: '1.1.0'
-    CPU_PY_VERSION: '2'
+    FRAMEWORK_VERSION: '1.2.0'
+    CPU_PY2_VERSION: '2'
+    CPU_PY3_VERSION: '3'
     CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
-    GPU_PY_VERSION: '3'
+    GPU_PY2_VERSION: '2'
+    GPU_PY3_VERSION: '3'
     GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
     LOCAL_BASE_REPO: 'pytorch-base'
     ECR_REPO: 'sagemaker-test'
@@ -33,39 +35,26 @@ phases:
       # run unit tests
       - pytest test/unit
 
-      # build cpu base image
-      - base_dir="docker/$FRAMEWORK_VERSION/base"
-      - cpu_py2_base_tag="$FRAMEWORK_VERSION-cpu-py2"
-      - cpu_py3_base_tag="$FRAMEWORK_VERSION-cpu-py3"
+      # create wheel in dist/
+      - python3 setup.py bdist_wheel
+      - whl_name=$(ls dist/sagemaker_pytorch_serving_container*.whl)
+      - cp $whl_name sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl
 
       - cpu_dockerfile="Dockerfile.cpu"
-      - cd $base_dir
-      - docker build -t $LOCAL_BASE_REPO:$cpu_py2_base_tag -f $cpu_dockerfile --build-arg py_version=2 .
-      - docker build -t $LOCAL_BASE_REPO:$cpu_py3_base_tag -f $cpu_dockerfile --build-arg py_version=3 .
-      - cd ../../../
-
-      # build gpu base image
-      - gpu_py2_base_tag="$FRAMEWORK_VERSION-gpu-py2"
-      - gpu_py3_base_tag="$FRAMEWORK_VERSION-gpu-py3"
       - gpu_dockerfile="Dockerfile.gpu"
-      - cd $base_dir
-      - docker build -t $LOCAL_BASE_REPO:$gpu_py2_base_tag -f $gpu_dockerfile --build-arg py_version=2 .
-      - docker build -t $LOCAL_BASE_REPO:$gpu_py3_base_tag -f $gpu_dockerfile --build-arg py_version=3 .
-      - cd ../../../
-
-      # create wheel
-      - python3 setup.py bdist_wheel
-
-      # build cpu image
-      - build_dir="docker/$FRAMEWORK_VERSION/final"
+      # build py2 images
+      - build_dir="docker/$FRAMEWORK_VERSION/py$CPU_PY2_VERSION"
       - CPU_PY2_TAG="$FRAMEWORK_VERSION-cpu-py2-$BUILD_ID"
-      - CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3-$BUILD_ID"
+      - GPU_PY2_TAG="$FRAMEWORK_VERSION-gpu-py2-$BUILD_ID"
       - docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$CPU_PY2_TAG .
-      - docker build -f "$build_dir/$cpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$CPU_PY3_TAG .
+      - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$GPU_PY2_TAG .
 
-      # build gpu image
-      - GPU_PY2_TAG="$FRAMEWORK_VERSION-gpu-py2-$BUILD_ID"
+      # build py3 image
+      - build_dir="docker/$FRAMEWORK_VERSION/py$GPU_PY3_VERSION"
+      - CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3-$BUILD_ID"
       - GPU_PY3_TAG="$FRAMEWORK_VERSION-gpu-py3-$BUILD_ID"
-      - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=2 -t $PREPROD_IMAGE:$GPU_PY2_TAG .
+      - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$CPU_PY3_TAG .
       - docker build -f "$build_dir/$gpu_dockerfile" --build-arg py_version=3 -t $PREPROD_IMAGE:$GPU_PY3_TAG .
 
       # push images to ecr
@@ -84,8 +73,8 @@
       # run cpu integration tests
       - |
         if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --tag $CPU_PY3_TAG
-          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --tag $CPU_PY2_TAG
+          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY3_VERSION --processor cpu --tag $CPU_PY3_TAG
+          pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY2_VERSION --processor cpu --tag $CPU_PY2_TAG
         else
           echo "skipping cpu integration tests"
         fi
@@ -94,8 +83,8 @@
       - |
         if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
           printf "$SETUP_CMDS" > $SETUP_FILE
-          py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu --tag $GPU_PY3_TAG"
-          py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu --tag $GPU_PY2_TAG"
+          py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY3_VERSION --processor gpu --tag $GPU_PY3_TAG"
+          py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY2_VERSION --processor gpu --tag $GPU_PY2_TAG"
           remote-test --github-repo $GITHUB_REPO --test-cmd "$py3_cmd" --setup-file $SETUP_FILE --pr-number "$PR_NUM"
         else
           echo "skipping gpu integration tests"
@@ -104,17 +93,17 @@
       # run cpu sagemaker tests
       - |
         if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $CPU_PY3_TAG
-          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $CPU_PY2_TAG
+          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY3_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $CPU_PY3_TAG
+          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY2_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $CPU_PY2_TAG
        else
          echo "skipping cpu sagemaker tests"
        fi
 
       # run gpu sagemaker tests
       - |
         if has-matching-changes "test/" "tests/" "src/*.py" "docker/*" "buildspec.yml"; then
-          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GPU_PY3_TAG
-          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GPU_PY2_TAG
+          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY3_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GPU_PY3_TAG
+          pytest test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY2_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GPU_PY2_TAG
        else
          echo "skipping gpu sagemaker tests"
        fi
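For orientation (not part of the diff): the new publish steps amount to building the wheel once, giving it the fixed filename the Dockerfiles COPY, and then building each image from the repo root with the matching Dockerfile. A minimal by-hand sketch, assuming the repository root as working directory and pytorch-serving as a placeholder image name:

    # build the serving toolkit wheel into dist/ and pin the name the Dockerfiles expect
    python3 setup.py bdist_wheel
    cp dist/sagemaker_pytorch_serving_container*.whl sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl

    # build the py2 CPU image exactly as the buildspec does
    docker build -f docker/1.2.0/py2/Dockerfile.cpu --build-arg py_version=2 -t pytorch-serving:1.2.0-cpu-py2 .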
78 changes: 78 additions & 0 deletions docker/1.2.0/py2/Dockerfile.cpu
@@ -0,0 +1,78 @@
FROM ubuntu:16.04

LABEL maintainer="Amazon AI"

RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
cmake \
curl \
git \
wget \
vim \
jq \
libsm6 \
libxext6 \
libxrender-dev \
build-essential \
zlib1g-dev \
libglib2.0-0 \
libgl1-mesa-glx \
openjdk-8-jdk-headless

# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8

# Add arguments to achieve the version, python and url
ARG PYTHON_VERSION=2.7

RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda update conda && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION \
numpy==1.16.4 \
scipy==1.2.1 \
ipython==5.8.0 \
mkl==2019.4 \
mkl-include==2019.4 \
cython==0.29.12 \
typing==3.7.4 && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH

ARG PYTORCH_VERSION=1.2.0
ARG TORCHVISION_VERSION=0.4.0
ARG MMS_VERSION=1.0.5
RUN conda install -c conda-forge awscli==1.16.210 opencv==4.0.1 && \
conda install -y scikit-learn==0.20.3 \
pandas==0.24.2 \
pillow==6.1.0 \
h5py==2.9.0 \
requests==2.22.0 && \
conda install pytorch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION cpuonly -c pytorch && \
conda clean -ya && \
pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org && \
pip install --pre -U mxnet-model-server

RUN useradd -m model-server \
&& mkdir -p /home/model-server/tmp \
&& chown -R model-server /home/model-server

COPY docker/$PYTORCH_VERSION/py2/mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY docker/$PYTORCH_VERSION/py2/config.properties /home/model-server
COPY src/sagemaker_pytorch_serving_container/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py && \
chmod +x /usr/local/bin/deep_learning_container.py

COPY dist/sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl
RUN pip install --no-cache-dir /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl && \
rm /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl

ENV SAGEMAKER_SERVING_MODULE sagemaker_pytorch_serving_container.serving:main

EXPOSE 8080 8081
ENV TEMP=/home/model-server/tmp
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["mxnet-model-server", "--start", "--mms-config", "/home/model-server/config.properties"]
105 changes: 105 additions & 0 deletions docker/1.2.0/py2/Dockerfile.gpu
@@ -0,0 +1,105 @@
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
# NCCL_VERSION=2.4.7, CUDNN_VERSION=7.6.2.24
LABEL maintainer="Amazon AI"

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
ca-certificates \
cmake \
cuda-command-line-tools-10-0 \
cuda-cublas-10-0 \
cuda-cufft-10-0 \
cuda-curand-10-0 \
cuda-cusolver-10-0 \
cuda-cusparse-10-0 \
libgomp1 \
libibverbs-dev \
curl \
git \
wget \
vim \
jq \
libsm6 \
libxext6 \
libxrender-dev \
build-essential \
zlib1g-dev \
libglib2.0-0 \
libgl1-mesa-glx \
openjdk-8-jdk-headless

# See http://bugs.python.org/issue19846
ENV LANG C.UTF-8

# Add arguments to achieve the version, python and url
ARG PYTHON_VERSION=2.7

# Install OpenSSH, Allow OpenSSH to talk to containers without asking for confirmation
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd && \
cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda update conda && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION \
numpy==1.16.4 \
scipy==1.2.1 \
ipython==5.8.0 \
mkl==2019.4 \
mkl-include==2019.4 \
cython==0.29.12 \
typing==3.7.4 && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH

ARG PYTORCH_VERSION=1.2.0
ARG TORCHVISION_VERSION=0.4.0
ARG MMS_VERSION=1.0.5
RUN conda install -c pytorch magma-cuda100 && \
conda install -c conda-forge awscli==1.16.210 opencv==4.0.1 && \
conda install -y scikit-learn==0.20.3 \
pandas==0.24.2 \
pillow==6.1.0 \
h5py==2.9.0 \
requests==2.22.0 && \
conda install pytorch==$PYTORCH_VERSION torchvision==$TORCHVISION_VERSION cudatoolkit=10.0 -c pytorch && \
conda clean -ya && \
/opt/conda/bin/conda config --set ssl_verify False && \
pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org && \
pip install --pre -U mxnet-model-server

RUN useradd -m model-server \
&& mkdir -p /home/model-server/tmp \
&& chown -R model-server /home/model-server

COPY docker/$PYTORCH_VERSION/py2/mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY docker/$PYTORCH_VERSION/py2/config.properties /home/model-server
COPY src/sagemaker_pytorch_serving_container/deep_learning_container.py /usr/local/bin/deep_learning_container.py

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py && \
chmod +x /usr/local/bin/deep_learning_container.py

# Install OpenSSH for MPI to communicate between containers, Allow OpenSSH to talk to containers without asking for confirmation
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd && \
cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

# RUN pip install --no-cache-dir 'opencv-python>=4.0,<4.1'

COPY dist/sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl
RUN pip install --no-cache-dir /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl && \
rm /sagemaker_pytorch_serving_container-1.2-py2.py3-none-any.whl

ENV SAGEMAKER_SERVING_MODULE sagemaker_pytorch_serving_container.serving:main

EXPOSE 8080 8081
ENV TEMP=/home/model-server/tmp
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["mxnet-model-server", "--start", "--mms-config", "/home/model-server/config.properties"]
26 changes: 26 additions & 0 deletions docker/1.2.0/py2/config.properties
@@ -0,0 +1,26 @@
vmargs=-Xmx128m -XX:-UseLargePages -XX:+UseG1GC -XX:MaxMetaspaceSize=32M -XX:MaxDirectMemorySize=10m -XX:+ExitOnOutOfMemoryError
model_store=/opt/ml/model
load_models=ALL
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
# management_address=unix:/tmp/management.sock
# number_of_netty_threads=0
# netty_client_threads=0
# default_response_timeout=120
# default_workers_per_model=0
# job_queue_size=100
# async_logging=false
# number_of_gpu=1
# cors_allowed_origin
# cors_allowed_methods
# cors_allowed_headers
# keystore=src/test/resources/keystore.p12
# keystore_pass=changeit
# keystore_type=PKCS12
# private_key_file=src/test/resources/key.pem
# certificate_file=src/test/resources/certs.pem
# max_response_size=6553500
# max_request_size=6553500
# blacklist_env_vars=
# decode_input_request=false
# enable_envvars_config=false
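With this config, MMS auto-loads every model archive it finds in /opt/ml/model (where SageMaker mounts the untarred model) and serves inference on port 8080 and management on port 8081, matching the ports the Dockerfiles EXPOSE. A hedged check against a running container:

    curl http://localhost:8080/ping     # inference API health check
    curl http://localhost:8081/models   # management API: list models loaded from model_store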
17 changes: 17 additions & 0 deletions docker/1.2.0/py2/mms-entrypoint.py
@@ -0,0 +1,17 @@
import shlex
import subprocess
import sys
import os.path

from sagemaker_pytorch_serving_container import serving

if not os.path.exists("/opt/ml/input/config"):
subprocess.call(['python', '/usr/local/bin/deep_learning_container.py', '&>/dev/null', '&'])

if sys.argv[1] == 'serve':
serving.main()
else:
subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))

# prevent docker exit
subprocess.call(['tail', '-f', '/dev/null'])
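How this script gets its arguments: the Dockerfiles set it as ENTRYPOINT, so the CMD — or whatever is passed to docker run — arrives in sys.argv[1:]. SageMaker starts serving containers with the single argument serve, which routes into serving.main(); anything else falls through to check_call, including the default mxnet-model-server CMD. Illustrative invocations, reusing the placeholder image tag from above:

    docker run pytorch-serving:1.2.0-cpu-py2 serve                         # SageMaker path: serving.main()
    docker run pytorch-serving:1.2.0-cpu-py2                               # default CMD: starts MMS via the else branch
    docker run pytorch-serving:1.2.0-cpu-py2 mxnet-model-server --help     # arbitrary command, also the else branch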