Skip to content

Commit cb2a421

Browse files
Maskrcnn new features and support for issues (#1078)
* add support for inference fixes issue 1028
* clean up notebook
* add environment variable for pre-trained model
* add support for inference and writing logs to shared file systems
* fix default model name
* document updates
* create log output directory on EFS file system
* add new notebook for experiment trials
1 parent a32a4b8 commit cb2a421

File tree

23 files changed

+2633
-34
lines changed

23 files changed

+2633
-34
lines changed

advanced_functionality/distributed_tensorflow_mask_rcnn/container-optimized/resources/train.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import os
3+
import shutil
34
import subprocess
45
import sys
56
import time
@@ -114,13 +115,29 @@ def build_host_arg(host_list, gpu_per_host):
114115
arg += f'{host}:{gpu_per_host}'
115116
return arg
116117

117-
118+
def copy_files(src, dest):
    """Copy every regular file directly inside ``src`` into the directory ``dest``.

    The copy is not recursive: sub-directories of ``src`` are skipped.
    ``dest`` is created when it does not exist yet so that each file is
    copied *into* it. Without that, ``shutil.copy`` treats a missing ``dest``
    as a target file name and every copy overwrites the previous one.

    Args:
        src: Path of the directory whose files are copied.
        dest: Path of the directory that receives the copies.

    Raises:
        OSError: If ``src`` cannot be listed, ``dest`` cannot be created
            (for example because it exists as a regular file), or a copy
            fails.
    """
    # List first so that a missing ``src`` fails before ``dest`` is touched.
    names = os.listdir(src)
    os.makedirs(dest, exist_ok=True)
    for name in names:
        path = os.path.join(src, name)
        if os.path.isfile(path):
            shutil.copy(path, dest)
124+
118125
def train():
119126

120127
import pprint
121128
pprint.pprint(dict(os.environ), width = 1)
122129

123130
model_dir = os.environ['SM_MODEL_DIR']
131+
log_dir = None
132+
133+
copy_logs_to_model_dir = False
134+
135+
try:
136+
log_dir = os.environ['SM_CHANNEL_LOG']
137+
copy_logs_to_model_dir = True
138+
except KeyError:
139+
log_dir = model_dir
140+
124141
train_data_dir = os.environ['SM_CHANNEL_TRAIN']
125142

126143
print("pre-setup check")
@@ -203,6 +220,20 @@ def train():
203220
except KeyError:
204221
images_per_epoch = 120000
205222

223+
try:
224+
backbone_weights = hyperparamters['backbone_weights']
225+
except KeyError:
226+
backbone_weights = 'ImageNet-R50-AlignPadding.npz'
227+
228+
try:
229+
resnet_arch = hyperparamters['resnet_arch']
230+
except KeyError:
231+
resnet_arch = 'resnet50'
232+
233+
resnet_num_blocks = '[3, 4, 6, 3]'
234+
if resnet_arch == 'resnet101':
235+
resnet_num_blocks = '[3, 4, 23, 3]'
236+
206237
gpus_per_host = int(os.environ['SM_NUM_GPUS'])
207238
numprocesses = len(all_hosts) * int(gpus_per_host)
208239

@@ -226,15 +257,16 @@ def train():
226257
-x LD_LIBRARY_PATH -x PATH \\
227258
--output-filename {model_dir} \\
228259
/usr/local/bin/python3.6 /mask-rcnn-tensorflow/MaskRCNN/train.py \
229-
--logdir {model_dir} \
260+
--logdir {log_dir} \
230261
--fp16 \
231262
--throughput_log_freq=2000 \
232263
--images_per_epoch {images_per_epoch} \
233264
--config \
234265
MODE_FPN={mode_fpn} \
235266
MODE_MASK={mode_mask} \
236267
DATA.BASEDIR={train_data_dir} \
237-
BACKBONE.WEIGHTS={train_data_dir}/pretrained-models/ImageNet-R50-AlignPadding.npz \
268+
BACKBONE.RESNET_NUM_BLOCKS='{resnet_num_blocks}' \
269+
BACKBONE.WEIGHTS={train_data_dir}/pretrained-models/{backbone_weights} \
238270
BACKBONE.NORM={batch_norm} \
239271
DATA.TRAIN='["{data_train}"]' \
240272
DATA.VAL='("{data_val}",)' \
@@ -269,6 +301,9 @@ def train():
269301
print("train exception occured", file=sys.stderr)
270302
exitcode = 1
271303
print(str(e), file=sys.stderr)
304+
finally:
305+
if copy_logs_to_model_dir:
306+
copy_files(log_dir, model_dir)
272307

273308
sys.stdout.flush()
274309
sys.stderr.flush()
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Image built on the AWS TensorFlow 1.13 Horovod GPU training container.
# It swaps in a custom TensorFlow wheel, reinstalls Horovod, sets up SSH for MPI,
# installs the mask-rcnn-tensorflow package plus flask/nginx, and starts /serve.py.
FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.13-horovod-gpu-py36-cu100-ubuntu16.04-v2.0

ENV HOROVOD_VERSION=0.18.1

RUN pip install --upgrade pip

# Need to reinstall some libraries the DL container provides due to custom Tensorflow binary
RUN pip uninstall -y tensorflow tensorboard tensorflow-estimator keras h5py horovod numpy

# Download and install custom Tensorflow binary
RUN wget https://github.com/aws-samples/mask-rcnn-tensorflow/releases/download/v0.0.0/tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \
    pip install tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl && \
    pip install tensorflow-estimator==1.13.0 && \
    rm tensorflow-1.13.0-cp36-cp36m-linux_x86_64.whl

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \
    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} && \
    ldconfig

# Install OpenSSH for MPI to communicate between containers.
# The package index is refreshed in the same layer as the install so the
# install never runs against a missing or stale (cached) index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends openssh-client openssh-server && \
    rm -rf /var/lib/apt/lists/*
RUN mkdir -p /var/run/sshd && \
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd

# Generate a root key pair, authorize it for login, and disable host-key prompts.
RUN rm -rf /root/.ssh/ && \
    mkdir -p /root/.ssh/ && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

RUN pip install awscli
RUN pip install boto3
RUN pip install ujson==1.35
RUN pip install opencv-python==4.1.0.25
RUN pip install Cython==0.28.4
RUN pip install pycocotools==2.0.0
RUN pip install matplotlib==3.0.3
RUN pip install markdown==3.1
RUN pip install numpy==1.17.5

# NOTE(review): the clone is not pinned to a commit or tag, so rebuilding the
# image can pick up a different revision — confirm whether a pin is intended.
RUN git clone https://github.com/aws-samples/mask-rcnn-tensorflow

RUN chmod -R +w /mask-rcnn-tensorflow
RUN pip install -e /mask-rcnn-tensorflow/

##########################################################################################
# SageMaker requirements
##########################################################################################
## install flask
RUN pip install flask

### Install nginx
RUN apt-get -y update && apt-get install -y --no-install-recommends \
    wget \
    nginx \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Copy the files from resources/ (those with an extension) to the image root.
COPY resources/*.* /
ENV WORKDIR=/

ENTRYPOINT ["python", "/serve.py"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env bash

# This script shows how to build the Docker image and push it to ECR to be ready for use
# by SageMaker.

# The single argument to this script is the AWS region of the target ECR repository.
# The image name and tag come from set_env.sh (IMAGE_NAME, IMAGE_TAG); they are used for
# the local image and combined with the account and region to form the ECR repository name.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Quoted so that a checkout path containing spaces still works.
source "$DIR/set_env.sh"

# set region
region=
if [ "$#" -eq 1 ]; then
    region=$1
else
    echo "usage: $0 <aws-region>"
    exit 1
fi


image=$IMAGE_NAME
tag=$IMAGE_TAG

# Get the account number associated with the current IAM credentials
account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
    exit 255
fi


fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:${tag}"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --region "${region}" --repository-names "${image}" > /dev/null 2>&1
if [ $? -ne 0 ]; then
    aws ecr create-repository --region "${region}" --repository-name "${image}" > /dev/null
fi


# Build the docker image locally with the image name and then push it to ECR
# with the full name.

# Log in to the registry that hosts the base image (account 763104351884, us-west-2):
# get the login command from ECR and execute it directly.
# NOTE: `aws ecr get-login` exists only in AWS CLI v1; AWS CLI v2 replaces it with
# `aws ecr get-login-password | docker login --username AWS --password-stdin <registry>`.
$(aws ecr get-login --no-include-email --region us-west-2 --registry-ids 763104351884)

# Stop right away when the build fails instead of going on to tag and push.
docker build -t "${image}" "$DIR/.."
if [ $? -ne 0 ]; then
    echo "Error: Image build failed"
    exit 1
fi
docker tag "${image}" "${fullname}"

# Get the login command from ECR and execute it directly
$(aws ecr get-login --region "${region}" --no-include-email)
docker push "${fullname}"
if [ $? -eq 0 ]; then
    echo "Amazon ECR URI: ${fullname}"
else
    echo "Error: Image build and push failed"
    exit 1
fi
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/usr/bin/env bash

# Name and tag of the Docker image; meant to be sourced by the build script.
IMAGE_NAME=mask-rcnn-tensorflow-serving-sagemaker
IMAGE_TAG=tf1.13-153442b
export IMAGE_NAME IMAGE_TAG
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# nginx front end listening on port 8080: forwards /ping and /invocations to the
# gunicorn upstream over a unix socket and answers every other path with 404.
worker_processes 1;
2+
daemon off; # Prevent forking
3+
4+
5+
pid /tmp/nginx.pid;
6+
error_log /var/log/nginx/error.log;
7+
8+
events {
9+
# defaults
10+
}
11+
12+
http {
13+
include /etc/nginx/mime.types;
14+
default_type application/octet-stream;
15+
access_log /var/log/nginx/access.log combined;
16+
17+
# Backend application server, reached through a unix domain socket.
upstream gunicorn {
18+
server unix:/tmp/gunicorn.sock;
19+
}
20+
21+
server {
22+
listen 0.0.0.0:8080 deferred;
23+
# Request bodies larger than 5 MB are rejected.
client_max_body_size 5m;
24+
25+
keepalive_timeout 70;
26+
# Wait up to 1200 seconds for the upstream to respond.
proxy_read_timeout 1200s;
27+
28+
# Regex match: any URI that starts with /ping or /invocations is proxied to gunicorn.
location ~ ^/(ping|invocations) {
29+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
30+
proxy_set_header Host $http_host;
31+
proxy_redirect off;
32+
proxy_pass http://gunicorn;
33+
}
34+
35+
# Everything else gets a 404 with an empty JSON object as the body.
location / {
36+
return 404 "{}";
37+
}
38+
}
39+
}

0 commit comments

Comments
 (0)