Merge pull request #119 from awslabs/xgboost-updates

vrkhare · web-flow · commit 495bea9f6f1d · 2017-11-29T07:08:25.000-08:00
Enclose the training-job get_waiter call in try/except
diff --git a/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb b/advanced_functionality/handling_kms_encrypted_data/handling_kms_encrypted_data.ipynb
@@ -50,7 +50,6 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": true,
     "isConfigCell": true
    },
    "outputs": [],
@@ -69,10 +68,10 @@
     "assumed_role = boto3.client('sts').get_caller_identity()['Arn']\n",
     "role = re.sub(r'^(.+)sts::(\\d+):assumed-role/(.+?)/.*$', r'\\1iam::\\2:role/\\3', assumed_role)\n",
     "\n",
-    "kms_key_id = '<your_kms_key_arn_here>'\n",
+    "kms_key_id = '<your-kms-key-id>'\n",
     "\n",
-    "bucket='<your_s3_bucket_name_here>' # put your s3 bucket name here, and create s3 bucket\n",
-    "prefix = 'sagemarker/kms-new'\n",
+    "bucket='<s3-bucket>' # replace with your S3 bucket name; create the bucket first if it does not exist\n",
+    "prefix = 'sagemaker/kms'\n",
     "# customize to your bucket where you have stored the data\n",
     "bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)"
    ]
@@ -90,17 +89,6 @@
     "We, first, read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. Then, the next step would be to transfer the data to S3 for use in training. For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "!conda install -y -c conda-forge scikit-learn"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -189,9 +177,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "s3 = boto3.client('s3')\n",
@@ -248,9 +234,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -299,7 +283,7 @@
     "                    \"S3DataDistributionType\": \"FullyReplicated\"\n",
     "                }\n",
     "            },\n",
-    "            \"ContentType\": \"libsvm\",\n",
+    "            \"ContentType\": \"csv\",\n",
     "            \"CompressionType\": \"None\"\n",
     "        },\n",
     "        {\n",
@@ -320,12 +304,17 @@
     "client = boto3.client('sagemaker')\n",
     "client.create_training_job(**create_training_params)\n",
     "\n",
-    "status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n",
-    "print(status)\n",
-    "while status !='Completed' and status!='Failed':\n",
-    "    time.sleep(60)\n",
-    "    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n",
-    "    print(status)"
+    "try:\n",
+    "    # wait for the job to finish and report the ending status\n",
+    "    client.get_waiter('TrainingJob_Created').wait(TrainingJobName=job_name)\n",
+    "    training_info = client.describe_training_job(TrainingJobName=job_name)\n",
+    "    status = training_info['TrainingJobStatus']\n",
+    "    print(\"Training job ended with status: \" + status)\n",
+    "except:\n",
+    "    print('Training failed to start')\n",
+    "    # an exception raised by the waiter means the training job has failed\n",
+    "    message = client.describe_training_job(TrainingJobName=job_name)['FailureReason']\n",
+    "    print('Training failed with the following error: {}'.format(message))"
    ]
   },
   {
@@ -343,9 +332,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -384,9 +371,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from time import gmtime, strftime\n",
@@ -416,9 +401,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -431,18 +414,26 @@
     "    EndpointConfigName=endpoint_config_name)\n",
     "print(create_endpoint_response['EndpointArn'])\n",
     "\n",
-    "resp = client.describe_endpoint(EndpointName=endpoint_name)\n",
-    "status = resp['EndpointStatus']\n",
-    "print(\"Status: \" + status)\n",
     "\n",
-    "while status=='Creating':\n",
-    "    time.sleep(60)\n",
-    "    resp = client.describe_endpoint(EndpointName=endpoint_name)\n",
-    "    status = resp['EndpointStatus']\n",
-    "    print(\"Status: \" + status)\n",
+    "print('EndpointArn = {}'.format(create_endpoint_response['EndpointArn']))\n",
+    "\n",
+    "# get the status of the endpoint\n",
+    "response = client.describe_endpoint(EndpointName=endpoint_name)\n",
+    "status = response['EndpointStatus']\n",
+    "print('EndpointStatus = {}'.format(status))\n",
+    "\n",
     "\n",
-    "print(\"Arn: \" + resp['EndpointArn'])\n",
-    "print(\"Status: \" + status)"
+    "# wait until the status has changed\n",
+    "client.get_waiter('Endpoint_Created').wait(EndpointName=endpoint_name)\n",
+    "\n",
+    "\n",
+    "# print the status of the endpoint\n",
+    "endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)\n",
+    "status = endpoint_response['EndpointStatus']\n",
+    "print('Endpoint creation ended with EndpointStatus = {}'.format(status))\n",
+    "\n",
+    "if status != 'InService':\n",
+    "    raise Exception('Endpoint creation failed.')"
    ]
   },
   {
@@ -508,9 +499,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "%%time\n",
@@ -542,12 +531,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "#client.delete_endpoint(EndpointName=endpoint_name)"
+    "# client.delete_endpoint(EndpointName=endpoint_name)"
    ]
   }
  ],