Skip to content

Fixing Debugger BYOC example to use pySDK v1 #1567

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 25, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
"outputs": [],
"source": [
"import sys\n",
"!{sys.executable} -m pip install \"sagemaker>=2.0.0\" smdebug"
"!{sys.executable} -m pip install \"sagemaker==1.72.0\" smdebug"
]
},
{
Expand All @@ -78,20 +78,9 @@
},
{
"cell_type": "code",
"execution_count": 329,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.5.0'"
]
},
"execution_count": 329,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import sagemaker\n",
"sagemaker.__version__"
Expand All @@ -117,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"scrolled": true
},
Expand All @@ -126,22 +115,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"FROM tensorflow/tensorflow:2.2.0rc2-py3-jupyter\n",
"\n",
"# Install Amazon SageMaker Python SDK liabrary for training and smdebug\n",
"RUN pip install sagemaker-training\n",
"RUN pip install smdebug\n",
"\n",
"# Copies the training code inside the container\n",
"COPY tf_keras_resnet_byoc.py /opt/ml/code/tf_keras_resnet_byoc.py\n",
"\n",
"# Defines train.py as script entrypoint\n",
"ENV SAGEMAKER_PROGRAM tf_keras_resnet_byoc.py"
"\u001b[34mFROM\u001b[39;49;00m \u001b[33mtensorflow/tensorflow:2.2.0rc2-py3-jupyter\u001b[39;49;00m\r\n",
"\r\n",
"\u001b[37m# Install Amazon SageMaker training toolkit and smdebug libraries\u001b[39;49;00m\r\n",
"\u001b[34mRUN\u001b[39;49;00m pip install sagemaker-training\r\n",
"\u001b[34mRUN\u001b[39;49;00m pip install smdebug\r\n",
"\r\n",
"\u001b[37m# Copies the training code inside the container\u001b[39;49;00m\r\n",
"\u001b[34mCOPY\u001b[39;49;00m tf_keras_resnet_byoc.py /opt/ml/code/tf_keras_resnet_byoc.py\r\n",
"\r\n",
"\u001b[37m# Defines train.py as script entrypoint\u001b[39;49;00m\r\n",
"\u001b[34mENV\u001b[39;49;00m SAGEMAKER_PROGRAM tf_keras_resnet_byoc.py\r\n"
]
}
],
"source": [
"! cat docker/Dockerfile"
"! pygmentize docker/Dockerfile"
]
},
{
Expand All @@ -168,108 +157,108 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\"\"\"\n",
"This script is a ResNet training script which uses Tensorflow's Keras interface, and provides an example of how to use SageMaker Debugger when you use your own custom container in SageMaker or your own script outside SageMaker.\n",
"It has been orchestrated with SageMaker Debugger hooks to allow saving tensors during training.\n",
"These hooks have been instrumented to read from a JSON configuration that SageMaker puts in the training container.\n",
"Configuration provided to the SageMaker python SDK when creating a job will be passed on to the hook.\n",
"This allows you to use the same script with different configurations across different runs.\n",
"\n",
"If you use an official SageMaker Framework container (i.e. AWS Deep Learning Container), you do not have to orchestrate your script as below. Hooks are automatically added in those environments. This experience is called a \"zero script change\". For more information, see https://github.com/awslabs/sagemaker-debugger/blob/master/docs/sagemaker.md#zero-script-change. An example of the same is provided at https://github.com/awslabs/amazon-sagemaker-examples/sagemaker-debugger/tensorflow2/tensorflow2_zero_code_change.\n",
"\"\"\"\n",
"\n",
"# Standard Library\n",
"import argparse\n",
"import random\n",
"\n",
"# Third Party\n",
"import numpy as np\n",
"import tensorflow.compat.v2 as tf\n",
"from tensorflow.keras.applications.resnet50 import ResNet50\n",
"from tensorflow.keras.datasets import cifar10\n",
"from tensorflow.keras.utils import to_categorical\n",
"\n",
"# smdebug modification: Import smdebug support for Tensorflow\n",
"import smdebug.tensorflow as smd\n",
"\n",
"\n",
"def train(batch_size, epoch, model, hook):\n",
" (X_train, y_train), (X_valid, y_valid) = cifar10.load_data()\n",
"\n",
" Y_train = to_categorical(y_train, 10)\n",
" Y_valid = to_categorical(y_valid, 10)\n",
"\n",
" X_train = X_train.astype('float32')\n",
" X_valid = X_valid.astype('float32')\n",
"\n",
" mean_image = np.mean(X_train, axis=0)\n",
" X_train -= mean_image\n",
" X_valid -= mean_image\n",
" X_train /= 128.\n",
" X_valid /= 128.\n",
" \n",
" # register hook to save the following scalar values\n",
" hook.save_scalar(\"epoch\", epoch)\n",
" hook.save_scalar(\"batch_size\", batch_size)\n",
" hook.save_scalar(\"train_steps_per_epoch\", len(X_train)/batch_size)\n",
" hook.save_scalar(\"valid_steps_per_epoch\", len(X_valid)/batch_size)\n",
" \n",
" model.fit(X_train, Y_train,\n",
" batch_size=batch_size,\n",
" epochs=epoch,\n",
" validation_data=(X_valid, Y_valid),\n",
" shuffle=False,\n",
" # smdebug modification: Pass the hook as a Keras callback\n",
" callbacks=[hook])\n",
"\n",
"\n",
"def main():\n",
" parser = argparse.ArgumentParser(description=\"Train resnet50 cifar10\")\n",
" parser.add_argument(\"--batch_size\", type=int, default=50)\n",
" parser.add_argument(\"--epoch\", type=int, default=15)\n",
" parser.add_argument(\"--model_dir\", type=str, default=\"./model_keras_resnet\")\n",
" parser.add_argument(\"--lr\", type=float, default=0.001)\n",
" parser.add_argument(\"--random_seed\", type=bool, default=False)\n",
" \n",
" args = parser.parse_args()\n",
"\n",
" if args.random_seed:\n",
" tf.random.set_seed(2)\n",
" np.random.seed(2)\n",
" random.seed(12)\n",
"\n",
" \n",
" mirrored_strategy = tf.distribute.MirroredStrategy()\n",
" with mirrored_strategy.scope():\n",
" \n",
" model = ResNet50(weights=None, input_shape=(32,32,3), classes=10)\n",
"\n",
" # smdebug modification:\n",
" # Create hook from the configuration provided through sagemaker python sdk.\n",
" # This configuration is provided in the form of a JSON file.\n",
" # Default JSON configuration file:\n",
" # {\n",
" # \"LocalPath\": <path on device where tensors will be saved>\n",
" # }\"\n",
" # Alternatively, you could pass custom debugger configuration (using DebuggerHookConfig)\n",
" # through SageMaker Estimator. For more information, https://github.com/aws/sagemaker-python-sdk/blob/master/doc/amazon_sagemaker_debugger.rst\n",
" hook = smd.KerasHook.create_from_json_file()\n",
"\n",
" opt = tf.keras.optimizers.Adam(learning_rate=args.lr)\n",
" model.compile(loss='categorical_crossentropy',\n",
" optimizer=opt,\n",
" metrics=['accuracy'])\n",
"\n",
" # start the training.\n",
" train(args.batch_size, args.epoch, model, hook)\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n"
"\u001b[33m\"\"\"\u001b[39;49;00m\r\n",
"\u001b[33mThis script is a ResNet training script which uses Tensorflow's Keras interface, and provides an example of how to use SageMaker Debugger when you use your own custom container in SageMaker or your own script outside SageMaker.\u001b[39;49;00m\r\n",
"\u001b[33mIt has been orchestrated with SageMaker Debugger hooks to allow saving tensors during training.\u001b[39;49;00m\r\n",
"\u001b[33mThese hooks have been instrumented to read from a JSON configuration that SageMaker puts in the training container.\u001b[39;49;00m\r\n",
"\u001b[33mConfiguration provided to the SageMaker python SDK when creating a job will be passed on to the hook.\u001b[39;49;00m\r\n",
"\u001b[33mThis allows you to use the same script with different configurations across different runs.\u001b[39;49;00m\r\n",
"\u001b[33m\u001b[39;49;00m\r\n",
"\u001b[33mIf you use an official SageMaker Framework container (i.e. AWS Deep Learning Container), you do not have to orchestrate your script as below. Hooks are automatically added in those environments. This experience is called a \"zero script change\". For more information, see https://github.com/awslabs/sagemaker-debugger/blob/master/docs/sagemaker.md#zero-script-change. An example of the same is provided at https://github.com/awslabs/amazon-sagemaker-examples/sagemaker-debugger/tensorflow2/tensorflow2_zero_code_change.\u001b[39;49;00m\r\n",
"\u001b[33m\"\"\"\u001b[39;49;00m\r\n",
"\r\n",
"\u001b[37m# Standard Library\u001b[39;49;00m\r\n",
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36margparse\u001b[39;49;00m\r\n",
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mrandom\u001b[39;49;00m\r\n",
"\r\n",
"\u001b[37m# Third Party\u001b[39;49;00m\r\n",
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mnumpy\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mnp\u001b[39;49;00m\r\n",
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mtensorflow\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mcompat\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mv2\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mtf\u001b[39;49;00m\r\n",
"\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mtensorflow\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mkeras\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mapplications\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mresnet50\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m ResNet50\r\n",
"\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mtensorflow\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mkeras\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mdatasets\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m cifar10\r\n",
"\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36mtensorflow\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mkeras\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mutils\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m to_categorical\r\n",
"\r\n",
"\u001b[37m# smdebug modification: Import smdebug support for Tensorflow\u001b[39;49;00m\r\n",
"\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36msmdebug\u001b[39;49;00m\u001b[04m\u001b[36m.\u001b[39;49;00m\u001b[04m\u001b[36mtensorflow\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36msmd\u001b[39;49;00m\r\n",
"\r\n",
"\r\n",
"\u001b[34mdef\u001b[39;49;00m \u001b[32mtrain\u001b[39;49;00m(batch_size, epoch, model, hook):\r\n",
" (X_train, y_train), (X_valid, y_valid) = cifar10.load_data()\r\n",
"\r\n",
" Y_train = to_categorical(y_train, \u001b[34m10\u001b[39;49;00m)\r\n",
" Y_valid = to_categorical(y_valid, \u001b[34m10\u001b[39;49;00m)\r\n",
"\r\n",
" X_train = X_train.astype(\u001b[33m'\u001b[39;49;00m\u001b[33mfloat32\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n",
" X_valid = X_valid.astype(\u001b[33m'\u001b[39;49;00m\u001b[33mfloat32\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m)\r\n",
"\r\n",
" mean_image = np.mean(X_train, axis=\u001b[34m0\u001b[39;49;00m)\r\n",
" X_train -= mean_image\r\n",
" X_valid -= mean_image\r\n",
" X_train /= \u001b[34m128.\u001b[39;49;00m\r\n",
" X_valid /= \u001b[34m128.\u001b[39;49;00m\r\n",
" \r\n",
" \u001b[37m# register hook to save the following scalar values\u001b[39;49;00m\r\n",
" hook.save_scalar(\u001b[33m\"\u001b[39;49;00m\u001b[33mepoch\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, epoch)\r\n",
" hook.save_scalar(\u001b[33m\"\u001b[39;49;00m\u001b[33mbatch_size\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, batch_size)\r\n",
" hook.save_scalar(\u001b[33m\"\u001b[39;49;00m\u001b[33mtrain_steps_per_epoch\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mlen\u001b[39;49;00m(X_train)/batch_size)\r\n",
" hook.save_scalar(\u001b[33m\"\u001b[39;49;00m\u001b[33mvalid_steps_per_epoch\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mlen\u001b[39;49;00m(X_valid)/batch_size)\r\n",
" \r\n",
" model.fit(X_train, Y_train,\r\n",
" batch_size=batch_size,\r\n",
" epochs=epoch,\r\n",
" validation_data=(X_valid, Y_valid),\r\n",
" shuffle=\u001b[34mFalse\u001b[39;49;00m,\r\n",
" \u001b[37m# smdebug modification: Pass the hook as a Keras callback\u001b[39;49;00m\r\n",
" callbacks=[hook])\r\n",
"\r\n",
"\r\n",
"\u001b[34mdef\u001b[39;49;00m \u001b[32mmain\u001b[39;49;00m():\r\n",
" parser = argparse.ArgumentParser(description=\u001b[33m\"\u001b[39;49;00m\u001b[33mTrain resnet50 cifar10\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n",
" parser.add_argument(\u001b[33m\"\u001b[39;49;00m\u001b[33m--batch_size\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mint\u001b[39;49;00m, default=\u001b[34m50\u001b[39;49;00m)\r\n",
" parser.add_argument(\u001b[33m\"\u001b[39;49;00m\u001b[33m--epoch\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mint\u001b[39;49;00m, default=\u001b[34m15\u001b[39;49;00m)\r\n",
" parser.add_argument(\u001b[33m\"\u001b[39;49;00m\u001b[33m--model_dir\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=\u001b[33m\"\u001b[39;49;00m\u001b[33m./model_keras_resnet\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n",
" parser.add_argument(\u001b[33m\"\u001b[39;49;00m\u001b[33m--lr\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mfloat\u001b[39;49;00m, default=\u001b[34m0.001\u001b[39;49;00m)\r\n",
" parser.add_argument(\u001b[33m\"\u001b[39;49;00m\u001b[33m--random_seed\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mbool\u001b[39;49;00m, default=\u001b[34mFalse\u001b[39;49;00m)\r\n",
" \r\n",
" args = parser.parse_args()\r\n",
"\r\n",
" \u001b[34mif\u001b[39;49;00m args.random_seed:\r\n",
" tf.random.set_seed(\u001b[34m2\u001b[39;49;00m)\r\n",
" np.random.seed(\u001b[34m2\u001b[39;49;00m)\r\n",
" random.seed(\u001b[34m12\u001b[39;49;00m)\r\n",
"\r\n",
" \r\n",
" mirrored_strategy = tf.distribute.MirroredStrategy()\r\n",
" \u001b[34mwith\u001b[39;49;00m mirrored_strategy.scope():\r\n",
" \r\n",
" model = ResNet50(weights=\u001b[34mNone\u001b[39;49;00m, input_shape=(\u001b[34m32\u001b[39;49;00m,\u001b[34m32\u001b[39;49;00m,\u001b[34m3\u001b[39;49;00m), classes=\u001b[34m10\u001b[39;49;00m)\r\n",
"\r\n",
" \u001b[37m# smdebug modification:\u001b[39;49;00m\r\n",
" \u001b[37m# Create hook from the configuration provided through sagemaker python sdk.\u001b[39;49;00m\r\n",
" \u001b[37m# This configuration is provided in the form of a JSON file.\u001b[39;49;00m\r\n",
" \u001b[37m# Default JSON configuration file:\u001b[39;49;00m\r\n",
" \u001b[37m# {\u001b[39;49;00m\r\n",
" \u001b[37m# \"LocalPath\": <path on device where tensors will be saved>\u001b[39;49;00m\r\n",
" \u001b[37m# }\"\u001b[39;49;00m\r\n",
" \u001b[37m# Alternatively, you could pass custom debugger configuration (using DebuggerHookConfig)\u001b[39;49;00m\r\n",
" \u001b[37m# through SageMaker Estimator. For more information, https://github.com/aws/sagemaker-python-sdk/blob/master/doc/amazon_sagemaker_debugger.rst\u001b[39;49;00m\r\n",
" hook = smd.KerasHook.create_from_json_file()\r\n",
"\r\n",
" opt = tf.keras.optimizers.Adam(learning_rate=args.lr)\r\n",
" model.compile(loss=\u001b[33m'\u001b[39;49;00m\u001b[33mcategorical_crossentropy\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m,\r\n",
" optimizer=opt,\r\n",
" metrics=[\u001b[33m'\u001b[39;49;00m\u001b[33maccuracy\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n",
"\r\n",
" \u001b[37m# start the training.\u001b[39;49;00m\r\n",
" train(args.batch_size, args.epoch, model, hook)\r\n",
"\r\n",
"\u001b[34mif\u001b[39;49;00m \u001b[31m__name__\u001b[39;49;00m == \u001b[33m\"\u001b[39;49;00m\u001b[33m__main__\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m:\r\n",
" main()\r\n"
]
}
],
"source": [
"! cat docker/tf_keras_resnet_byoc.py"
"! pygmentize docker/tf_keras_resnet_byoc.py"
]
},
{
Expand Down Expand Up @@ -451,7 +440,7 @@
"\n",
"Construct a SageMaker Estimator using the image URI of the custom training container you created in **Step 3**.\n",
"\n",
"**Note:** This example uses the SageMaker Python SDK v2. If you want to use the SageMaker Python SDK v1, you need to change the parameter names. You can find the SageMaker Estimator parameters at [Get Started with Custom Training Containers](https://docs.aws.amazon.com/sagemaker/latest/dg/build-container-to-train-script-get-started.html#byoc-training-step5) in the AWS SageMaker Developer Guide or at [the SageMaker Estimator API](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) in one of the older version of SageMaker Python SDK documentation."
"**Note:** This example uses the SageMaker Python SDK v1. If you want to use the SageMaker Python SDK v2, you need to change the parameter names. You can find the SageMaker Estimator parameters at [Get Started with Custom Training Containers](https://docs.aws.amazon.com/sagemaker/latest/dg/build-container-to-train-script-get-started.html#byoc-training-step5) in the AWS SageMaker Developer Guide or at [the SageMaker Estimator API](https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html) in one of the older version of SageMaker Python SDK documentation."
]
},
{
Expand All @@ -462,15 +451,14 @@
"source": [
"from sagemaker.estimator import Estimator\n",
"from sagemaker import get_execution_role\n",
"from sagemaker.tensorflow import TensorFlow\n",
"\n",
"role = get_execution_role()\n",
"\n",
"estimator = Estimator(\n",
" image_uri=byoc_image_uri,\n",
" image_name=byoc_image_uri,\n",
" role=role,\n",
" instance_count=1,\n",
" instance_type=\"ml.p3.16xlarge\",\n",
" train_instance_count=1,\n",
" train_instance_type=\"ml.p3.16xlarge\",\n",
"\n",
" # Debugger-specific parameters\n",
" rules = rules,\n",
Expand Down Expand Up @@ -612,6 +600,8 @@
"metadata": {},
"outputs": [],
"source": [
"from smdebug.core.modes import ModeKeys\n",
"\n",
"len(trial.tensor('loss').steps(mode=ModeKeys.TRAIN))"
]
},
Expand Down Expand Up @@ -672,7 +662,6 @@
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from smdebug.core.modes import ModeKeys\n",
"\n",
"# Retrieve the loss tensors collected in training mode\n",
"y = []\n",
Expand Down