Added: TensorFlow local mode example

djarpin · djarpin · commit 1c464240bbb6 · 2018-04-04T07:49:59.000-07:00
diff --git a/sagemaker-python-sdk/tensorflow_distributed_mnist/daemon.json b/sagemaker-python-sdk/tensorflow_distributed_mnist/daemon.json
@@ -0,0 +1,10 @@
+
+{
+	"default-runtime": "nvidia",
+    "runtimes": {
+        "nvidia": {
+            "path": "/usr/bin/nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    }
+}
diff --git a/sagemaker-python-sdk/tensorflow_distributed_mnist/setup.sh b/sagemaker-python-sdk/tensorflow_distributed_mnist/setup.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# GPU hosts only: make sure nvidia-docker2 is installed so Docker can use the NVIDIA runtime.
+if nvidia-smi > /dev/null 2>&1; then
+  # Number of installed RPM packages whose name contains nvidia-docker2.
+  nvidia_docker_pkgs=$(rpm -qa | grep -c nvidia-docker2)
+  if [ "$nvidia_docker_pkgs" -gt 0 ]; then
+    echo "nvidia-docker2 already installed. We are good to go!"
+  else
+    # Swap the installed Docker package for the pinned 17.09.1ce build and start it.
+    sudo yum -y remove docker
+    sudo yum -y install docker-17.09.1ce-1.111.amzn1
+    sudo /etc/init.d/docker start
+
+    # Register the nvidia-docker yum repository, then install nvidia-docker2 from it.
+    curl -s -L https://nvidia.github.io/nvidia-docker/amzn1/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
+    sudo yum install -y nvidia-docker2
+
+    # Install daemon.json from the current directory and send dockerd a SIGHUP.
+    sudo cp daemon.json /etc/docker/daemon.json
+    sudo pkill -SIGHUP dockerd
+    echo "installed nvidia-docker2"
+  fi
+fi
+
+# This is common for both GPU and CPU instances
+
+# Install docker-compose with pip unless `docker-compose version` already succeeds.
+if ! docker-compose version > /dev/null 2>&1; then
+  pip install docker-compose
+fi
+
+# Create the sagemaker-local bridge network when `docker network ls` does not list it.
+sagemaker_network_matches=$(docker network ls | grep -c sagemaker-local)
+if [ "$sagemaker_network_matches" -eq 0 ]; then
+  docker network create --driver bridge sagemaker-local
+fi
+
+# Notebook instance Docker networking fixes
+#
+# The host is treated as a notebook instance when the nat OUTPUT chain contains
+# a rule mentioning 169.254.0.2. Only then are the route table and the
+# PREROUTING chain inspected and, if needed, patched.
+RUNNING_ON_NOTEBOOK_INSTANCE=$(sudo iptables -S OUTPUT -t nat | grep -c 169.254.0.2)
+
+if [ "$RUNNING_ON_NOTEBOOK_INSTANCE" -gt 0 ]; then
+
+  # Bridge interface name: "br-" followed by the first column of the
+  # sagemaker-local row printed by `docker network ls`.
+  SAGEMAKER_INTERFACE=br-$(docker network ls | grep sagemaker-local | cut -d' ' -f1)
+
+  # Get the Docker Network CIDR and IP for the sagemaker-local docker interface.
+  # The CIDR is the first field of the interface's route entry; the IP is the
+  # token that follows "src". Looking "src" up by name (instead of a fixed
+  # column number) works whether `ip route` separates fields with one space
+  # or with two.
+  DOCKER_ROUTE=$(ip route | grep "$SAGEMAKER_INTERFACE" | head -n 1)
+  DOCKER_NET=$(echo "$DOCKER_ROUTE" | awk '{ print $1 }')
+  DOCKER_IP=$(echo "$DOCKER_ROUTE" | awk '{ for (i = 1; i < NF; i++) if ($i == "src") { print $(i + 1); exit } }')
+
+  # check if both IPTables and the Route Table are OK.
+  IPTABLES_PATCHED=$(sudo iptables -S PREROUTING -t nat | grep -c 169.254.0.2)
+  ROUTE_TABLE_PATCHED=$(sudo ip route show table agent | grep -c "$SAGEMAKER_INTERFACE")
+
+  if [ "$ROUTE_TABLE_PATCHED" -eq 0 ]; then
+    if [ -n "$DOCKER_NET" ] && [ -n "$DOCKER_IP" ]; then
+      # fix routing
+      sudo ip route add "$DOCKER_NET" via "$DOCKER_IP" dev "$SAGEMAKER_INTERFACE" table agent
+    else
+      # Without both values the `ip route add` command line would be malformed.
+      echo "Could not determine the network or IP of $SAGEMAKER_INTERFACE; skipping route table fix." >&2
+    fi
+  else
+    echo "SageMaker instance route table setup is ok. We are good to go."
+  fi
+
+  if [ "$IPTABLES_PATCHED" -eq 0 ]; then
+    # Redirect TCP port 80 traffic for 169.254.169.254 arriving on the bridge to 169.254.0.2:9081.
+    sudo iptables -t nat -A PREROUTING  -i "$SAGEMAKER_INTERFACE" -d 169.254.169.254/32 -p tcp -m tcp --dport 80 -j DNAT --to-destination 169.254.0.2:9081
+    echo "iptables for Docker setup done"
+  else
+    echo "SageMaker instance routing for Docker is ok. We are good to go!"
+  fi
+fi
diff --git a/sagemaker-python-sdk/tensorflow_distributed_mnist/tensorflow_local_mode_mnist.ipynb b/sagemaker-python-sdk/tensorflow_distributed_mnist/tensorflow_local_mode_mnist.ipynb
@@ -0,0 +1,285 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TensorFlow MNIST local training\n",
+    "\n",
+    "## Pre-requisites\n",
+    "\n",
+    "This notebook shows how to use the SageMaker Python SDK to run your code in a local container before deploying to SageMaker's managed training or hosting environments.  This can speed up iterative testing and debugging while using the same familiar Python SDK interface.  Just change your estimator's `train_instance_type` to `local` (or `local_gpu` if you're using an ml.p2 or ml.p3 notebook instance).\n",
+    "\n",
+    "In order to use this feature you'll need to install docker-compose (and nvidia-docker if training with a GPU).\n",
+    "\n",
+    "**Note, you can only run a single local notebook at one time.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!/bin/bash ./setup.sh"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overview\n",
+    "\n",
+    "The **SageMaker Python SDK** helps you deploy your models for training and hosting in optimized, production-ready containers in SageMaker. The SageMaker Python SDK is easy to use, modular, extensible and compatible with TensorFlow and MXNet. This tutorial focuses on how to create a convolutional neural network model to train the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) using **TensorFlow in local mode**.\n",
+    "\n",
+    "### Set up the environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import subprocess\n",
+    "import sagemaker\n",
+    "from sagemaker import get_execution_role\n",
+    "\n",
+    "sagemaker_session = sagemaker.Session()\n",
+    "\n",
+    "# Default to CPU local mode; switch to GPU only when nvidia-smi runs successfully.\n",
+    "instance_type = 'local'\n",
+    "\n",
+    "try:\n",
+    "    if subprocess.call('nvidia-smi') == 0:\n",
+    "        ## Set type to GPU if one is present\n",
+    "        instance_type = 'local_gpu'\n",
+    "except OSError:\n",
+    "    # subprocess.call raises OSError when the nvidia-smi executable cannot be\n",
+    "    # found or started; treat that as \"no GPU\" and keep 'local'.\n",
+    "    pass\n",
+    "\n",
+    "print(\"Instance type = \" + instance_type)\n",
+    "\n",
+    "role = get_execution_role()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Download the MNIST dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.contrib.learn.python.learn.datasets import mnist\n",
+    "import utils\n",
+    "\n",
+    "# Read MNIST into the local 'data' directory as unflattened uint8 images,\n",
+    "# holding out 5000 examples for validation.\n",
+    "data_sets = mnist.read_data_sets('data', dtype=tf.uint8, reshape=False, validation_size=5000)\n",
+    "\n",
+    "# Convert each split with utils.convert_to, writing into the same 'data' directory.\n",
+    "for split_name in ('train', 'validation', 'test'):\n",
+    "    utils.convert_to(getattr(data_sets, split_name), split_name, 'data')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Upload the data\n",
+    "We use the ```sagemaker.Session.upload_data``` function to upload our datasets to an S3 location. The return value, `inputs`, identifies the location -- we will use this later when we start the training job."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inputs = sagemaker_session.upload_data(path='data', key_prefix='data/mnist')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Construct a script for training \n",
+    "Here is the full code for the network model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "!cat 'mnist.py'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The script here is an adaptation of the [TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist). It provides a ```model_fn(features, labels, mode)```, which is used for training, evaluation and inference.\n",
+    "\n",
+    "## A regular ```model_fn```\n",
+    "\n",
+    "A regular **```model_fn```** follows the pattern:\n",
+    "1. [defines a neural network](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L96)\n",
+    "1. [applies the ```features``` in the neural network](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L178)\n",
+    "1. [if the ```mode``` is ```PREDICT```, returns the output from the neural network](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L186)\n",
+    "1. [calculates the loss function comparing the output with the ```labels```](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L188)\n",
+    "1. [creates an optimizer and minimizes the loss function to improve the neural network](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L193)\n",
+    "1. [returns the output, optimizer and loss function](https://github.com/tensorflow/models/blob/master/official/mnist/mnist.py#L205)\n",
+    "\n",
+    "## Writing a ```model_fn``` for distributed training\n",
+    "When distributed training happens, the same neural network will be sent to the multiple training instances. Each instance will predict a batch of the dataset, calculate the loss and run the optimizer to minimize it. One entire loop of this process is called a **training step**.\n",
+    "\n",
+    "### Synchronizing training steps\n",
+    "A [global step](https://www.tensorflow.org/api_docs/python/tf/train/global_step) is a global variable shared between the instances. It is necessary for distributed training, so the optimizer will keep track of the number of **training steps** between runs:\n",
+    "\n",
+    "```python\n",
+    "train_op = optimizer.minimize(loss, tf.train.get_or_create_global_step())\n",
+    "```\n",
+    "\n",
+    "That is the only required change for distributed training!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a training job using the sagemaker.TensorFlow estimator\n",
+    "\n",
+    "The `TensorFlow` class allows us to run our training function on SageMaker. We need to configure it with our training script, an IAM role, the number of training instances, and the training instance type.  The only difference from [tensorflow_distributed_mnist.ipynb](./tensorflow_distributed_mnist.ipynb) is that instead of ``train_instance_type='ml.c4.xlarge'``, we set it to ``train_instance_type='local'``.  For local training with GPU, we could set this to \"local_gpu\".  In this case, `instance_type` was set above based on whether you're running on a GPU instance.\n",
+    "\n",
+    "After we've constructed our `TensorFlow` object, we fit it using the data we uploaded to S3. Even though we're in local mode, using S3 as our data source makes sense because it maintains consistency with how SageMaker's distributed, managed training ingests data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "from sagemaker.tensorflow import TensorFlow\n",
+    "\n",
+    "# Configure a single-instance estimator that runs mnist.py for 10 training\n",
+    "# and 10 evaluation steps on the instance type selected earlier.\n",
+    "mnist_estimator = TensorFlow(\n",
+    "    entry_point='mnist.py',\n",
+    "    role=role,\n",
+    "    training_steps=10,\n",
+    "    evaluation_steps=10,\n",
+    "    train_instance_count=1,\n",
+    "    train_instance_type=instance_type)\n",
+    "\n",
+    "# Train on the dataset location returned by upload_data.\n",
+    "mnist_estimator.fit(inputs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The **```fit```** method will run the training job in a local container on this notebook instance rather than on managed SageMaker training instances. The logs above will show the container doing training, evaluation, and incrementing the number of **training steps**.\n",
+    "\n",
+    "At the end of training, the training job will generate a saved model for TF serving."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# Deploy the trained model to prepare for predictions\n",
+    "\n",
+    "The deploy() method creates an endpoint (in this case locally) which serves prediction requests in real-time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mnist_predictor = mnist_estimator.deploy(initial_instance_count=1,\n",
+    "                                             instance_type=instance_type)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Invoking the endpoint"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from tensorflow.examples.tutorials.mnist import input_data\n",
+    "\n",
+    "# Load MNIST with one-hot labels so each label can be decoded with argmax below.\n",
+    "mnist = input_data.read_data_sets(\"/tmp/data/\", one_hot=True)\n",
+    "\n",
+    "# Send the first 10 test images to the endpoint one at a time and print label vs. prediction.\n",
+    "for image, one_hot_label in zip(mnist.test.images[:10], mnist.test.labels[:10]):\n",
+    "    data = image.tolist()\n",
+    "    # `tf` is the TensorFlow module imported in the dataset download cell.\n",
+    "    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[1, len(data)], dtype=tf.float32)\n",
+    "    predict_response = mnist_predictor.predict(tensor_proto)\n",
+    "\n",
+    "    print(\"========================================\")\n",
+    "    label = np.argmax(one_hot_label)\n",
+    "    print(\"label is {}\".format(label))\n",
+    "    prediction = predict_response['outputs']['classes']['int64Val'][0]\n",
+    "    print(\"prediction is {}\".format(prediction))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Clean-up\n",
+    "\n",
+    "Deleting the local endpoint when you're finished is important since you can only run one local endpoint at a time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mnist_estimator.delete_endpoint()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "conda_tensorflow_p27",
+   "language": "python",
+   "name": "conda_tensorflow_p27"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.14"
+  },
+  "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.  Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}