|
53 | 53 | "cell_type": "code",
|
54 | 54 | "execution_count": null,
|
55 | 55 | "metadata": {
|
| 56 | + "collapsed": true, |
56 | 57 | "isConfigCell": true
|
57 | 58 | },
|
58 | 59 | "outputs": [],
|
|
78 | 79 | {
|
79 | 80 | "cell_type": "code",
|
80 | 81 | "execution_count": null,
|
81 |
| - "metadata": {}, |
| 82 | + "metadata": { |
| 83 | + "collapsed": true |
| 84 | + }, |
82 | 85 | "outputs": [],
|
83 | 86 | "source": [
|
84 | 87 | "import numpy as np # For matrix operations and numerical processing\n",
|
85 | 88 | "import pandas as pd # For munging tabular data\n",
|
86 | 89 | "import matplotlib.pyplot as plt # For charts and visualizations\n",
|
87 | 90 | "from IPython.display import Image # For displaying images in the notebook\n",
|
88 | 91 | "from IPython.display import display # For displaying outputs in the notebook\n",
|
89 |
| - "from sklearn.datasets import dump_svmlight_file # For outputting data to libsvm format for xgboost\n", |
90 | 92 | "from time import gmtime, strftime # For labeling SageMaker models, endpoints, etc.\n",
|
91 | 93 | "import sys # For writing outputs to notebook\n",
|
92 | 94 | "import math # For ceiling function\n",
|
|
298 | 300 | {
|
299 | 301 | "cell_type": "code",
|
300 | 302 | "execution_count": null,
|
301 |
| - "metadata": {}, |
| 303 | + "metadata": { |
| 304 | + "collapsed": true |
| 305 | + }, |
302 | 306 | "outputs": [],
|
303 | 307 | "source": [
|
304 | 308 | "data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0) # Indicator variable to capture when pdays takes a value of 999\n",
|
|
320 | 324 | {
|
321 | 325 | "cell_type": "code",
|
322 | 326 | "execution_count": null,
|
323 |
| - "metadata": {}, |
| 327 | + "metadata": { |
| 328 | + "collapsed": true |
| 329 | + }, |
324 | 330 | "outputs": [],
|
325 | 331 | "source": [
|
326 | 332 | "model_data = model_data.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'], axis=1)"
|
|
338 | 344 | {
|
339 | 345 | "cell_type": "code",
|
340 | 346 | "execution_count": null,
|
341 |
| - "metadata": {}, |
| 347 | + "metadata": { |
| 348 | + "collapsed": true |
| 349 | + }, |
342 | 350 | "outputs": [],
|
343 | 351 | "source": [
|
344 | 352 | "train_data, validation_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data)), int(0.9 * len(model_data))]) # Randomly sort the data then split out first 70%, second 20%, and last 10%"
|
|
348 | 356 | "cell_type": "markdown",
|
349 | 357 | "metadata": {},
|
350 | 358 | "source": [
|
351 |
| - "Amazon SageMaker's XGBoost container expects data in the libSVM data format. This expects features and the target variable to be provided as separate arguments. Let's split these apart. Notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering." |
| 359 | + "Amazon SageMaker's XGBoost container expects data in the libSVM or CSV data format. For this example, we'll stick to CSV. Note that the first column must be the target variable and the CSV should not include headers. Also, notice that although repetitive it's easiest to do this after the train|validation|test split rather than before. This avoids any misalignment issues due to random reordering." |
352 | 360 | ]
|
353 | 361 | },
|
354 | 362 | {
|
355 | 363 | "cell_type": "code",
|
356 | 364 | "execution_count": null,
|
357 |
| - "metadata": {}, |
| 365 | + "metadata": { |
| 366 | + "collapsed": true |
| 367 | + }, |
358 | 368 | "outputs": [],
|
359 | 369 | "source": [
|
360 |
| - "dump_svmlight_file(X=train_data.drop(['y_no', 'y_yes'], axis=1), y=train_data['y_yes'], f='train.libsvm')\n", |
361 |
| - "dump_svmlight_file(X=validation_data.drop(['y_no', 'y_yes'], axis=1), y=validation_data['y_yes'], f='validation.libsvm')\n", |
362 |
| - "dump_svmlight_file(X=test_data.drop(['y_no', 'y_yes'], axis=1), y=test_data['y_yes'], f='test.libsvm')" |
| 370 | + "pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)\n", |
| 371 | + "pd.concat([validation_data['y_yes'], validation_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('validation.csv', index=False, header=False)" |
363 | 372 | ]
|
364 | 373 | },
|
365 | 374 | {
|
|
372 | 381 | {
|
373 | 382 | "cell_type": "code",
|
374 | 383 | "execution_count": null,
|
375 |
| - "metadata": {}, |
| 384 | + "metadata": { |
| 385 | + "collapsed": true |
| 386 | + }, |
376 | 387 | "outputs": [],
|
377 | 388 | "source": [
|
378 |
| - "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.libsvm')).upload_file('train.libsvm')\n", |
379 |
| - "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.libsvm')).upload_file('validation.libsvm')" |
| 389 | + "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')\n", |
| 390 | + "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')" |
380 | 391 | ]
|
381 | 392 | },
|
382 | 393 | {
|
|
398 | 409 | {
|
399 | 410 | "cell_type": "code",
|
400 | 411 | "execution_count": null,
|
401 |
| - "metadata": {}, |
| 412 | + "metadata": { |
| 413 | + "collapsed": true |
| 414 | + }, |
402 | 415 | "outputs": [],
|
403 | 416 | "source": [
|
404 | 417 | "containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',\n",
|
|
411 | 424 | "cell_type": "markdown",
|
412 | 425 | "metadata": {},
|
413 | 426 | "source": [
|
414 |
| - "Then, because we're training with the libSVM file format, we'll create `s3_input`s that our training function can use as a pointer to the files in S3, which also specify that the content type is libSVM." |
| 427 | + "Then, because we're training with the CSV file format, we'll create `s3_input`s that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV." |
415 | 428 | ]
|
416 | 429 | },
|
417 | 430 | {
|
418 | 431 | "cell_type": "code",
|
419 | 432 | "execution_count": null,
|
420 |
| - "metadata": {}, |
| 433 | + "metadata": { |
| 434 | + "collapsed": true |
| 435 | + }, |
421 | 436 | "outputs": [],
|
422 | 437 | "source": [
|
423 |
| - "s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='libsvm')\n", |
424 |
| - "s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='libsvm')" |
| 438 | + "s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n", |
| 439 | + "s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')" |
425 | 440 | ]
|
426 | 441 | },
|
427 | 442 | {
|
|
478 | 493 | {
|
479 | 494 | "cell_type": "code",
|
480 | 495 | "execution_count": null,
|
481 |
| - "metadata": {}, |
| 496 | + "metadata": { |
| 497 | + "collapsed": true |
| 498 | + }, |
482 | 499 | "outputs": [],
|
483 | 500 | "source": [
|
484 | 501 | "xgb_predictor = xgb.deploy(initial_instance_count=1,\n",
|
|
494 | 511 | "## Evaluation\n",
|
495 | 512 | "There are many ways to compare the performance of a machine learning model, but let's start by simply comparing actual to predicted values. In this case, we're simply predicting whether the customer subscribed to a term deposit (`1`) or not (`0`), which produces a simple confusion matrix.\n",
|
496 | 513 | "\n",
|
497 |
| - "First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV." |
| 514 | + "First we'll need to determine how we pass data into and receive data from our endpoint. Our data is currently stored as NumPy arrays in memory of our notebook instance. To send it in an HTTP POST request, we'll serialize it as a CSV string and then decode the resulting CSV.\n", |
| 515 | + "\n", |
| 516 | + "*Note: For inference with CSV format, SageMaker XGBoost requires that the data does NOT include the target variable.*" |
498 | 517 | ]
|
499 | 518 | },
|
500 | 519 | {
|
501 | 520 | "cell_type": "code",
|
502 | 521 | "execution_count": null,
|
503 |
| - "metadata": {}, |
| 522 | + "metadata": { |
| 523 | + "collapsed": true |
| 524 | + }, |
504 | 525 | "outputs": [],
|
505 | 526 | "source": [
|
506 | 527 | "xgb_predictor.content_type = 'text/csv'\n",
|
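The remainder of this code cell is not visible in the hunk above. A minimal sketch of how it might continue, assuming the v1 SageMaker Python SDK's csv_serializer helper is what the notebook relies on:

    from sagemaker.predictor import csv_serializer  # serializes NumPy arrays/lists into CSV request bodies

    xgb_predictor.serializer = csv_serializer  # turn input arrays into CSV strings for the HTTP POST body
    xgb_predictor.deserializer = None          # keep the raw CSV response so it can be parsed manually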
|
514 | 535 | "Now, we'll use a simple function to:\n",
|
515 | 536 | "1. Loop over our test dataset\n",
|
516 | 537 | "1. Split it into mini-batches of rows \n",
|
517 |
| - "1. Convert those mini-batchs to CSV string payloads\n", |
| 538 | + "1. Convert those mini-batches to CSV string payloads (notice, we drop the target variable from our dataset first)\n", |
518 | 539 | "1. Retrieve mini-batch predictions by invoking the XGBoost endpoint\n",
|
519 | 540 | "1. Collect predictions and convert from the CSV output our model provides into a NumPy array"
|
520 | 541 | ]
|
521 | 542 | },
|
522 | 543 | {
|
523 | 544 | "cell_type": "code",
|
524 | 545 | "execution_count": null,
|
525 |
| - "metadata": {}, |
| 546 | + "metadata": { |
| 547 | + "collapsed": true |
| 548 | + }, |
526 | 549 | "outputs": [],
|
527 | 550 | "source": [
|
528 | 551 | "def predict(data, rows=500):\n",
|
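The body of predict is not shown in this hunk. A minimal sketch that follows the five steps listed above, assuming the predictor has already been configured with the CSV serializer, might look like:

    def predict(data, rows=500):
        # Split the feature matrix into mini-batches of at most `rows` rows
        split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
        predictions = ''
        for array in split_array:
            # Each mini-batch is serialized to a CSV payload; the endpoint returns CSV bytes
            predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])
        # Drop the leading separator and convert the collected CSV string into a NumPy array
        return np.fromstring(predictions[1:], sep=',')

    # Drop the target columns before sending the features to the endpoint
    predictions = predict(test_data.drop(['y_no', 'y_yes'], axis=1).values)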
|
546 | 569 | {
|
547 | 570 | "cell_type": "code",
|
548 | 571 | "execution_count": null,
|
549 |
| - "metadata": {}, |
| 572 | + "metadata": { |
| 573 | + "collapsed": true |
| 574 | + }, |
550 | 575 | "outputs": [],
|
551 | 576 | "source": [
|
552 | 577 | "pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions), rownames=['actuals'], colnames=['predictions'])"
|
|
556 | 581 | "cell_type": "markdown",
|
557 | 582 | "metadata": {},
|
558 | 583 | "source": [
|
559 |
| - "So, of the ~3700 potential customers we predicted would subscribe, 428 of them actually did. We also had 55 subscribers who subscribed that we did not predict would. This is less than desirable, but the model can (and should) be tuned to improve this. Most importantly, note that with minimal effort, our model produced accuracies similar to those published [here](http://media.salford-systems.com/video/tutorial/2015/targeted_marketing.pdf).\n", |
| 584 | + "So, of the ~4000 potential customers, we predicted 136 would subscribe and 94 of them actually did. We also had 389 subscribers who subscribed that we did not predict would. This is less than desirable, but the model can (and should) be tuned to improve this. Most importantly, note that with minimal effort, our model produced accuracies similar to those published [here](http://media.salford-systems.com/video/tutorial/2015/targeted_marketing.pdf).\n", |
560 | 585 | "\n",
|
561 | 586 | "_Note that because there is some element of randomness in the algorithm's subsample, your results may differ slightly from the text written above._"
|
562 | 587 | ]
|
|
569 | 594 | "\n",
|
570 | 595 | "## Extensions\n",
|
571 | 596 | "\n",
|
572 |
| - "This example analyzed a relatively small dataset, but utilized Amazon SageMaker features such as distributed, managed training and real-time model hosting, which could easily be applied to much larger problems. In order to improve predictive accuracy further, we could explore techniques like hyperparameter tuning, as well as spend more time engineering features by hand. In a real-worl scenario we may also look for additional datasets to include which contain customer information not available in our initial dataset." |
| 597 | + "This example analyzed a relatively small dataset, but utilized Amazon SageMaker features such as distributed, managed training and real-time model hosting, which could easily be applied to much larger problems. In order to improve predictive accuracy further, we could tweak value we threshold our predictions at to alter the mix of false-positives and false-negatives, or we could explore techniques like hyperparameter tuning. In a real-world scenario, we would also spend more time engineering features by hand and would likely look for additional datasets to include which contain customer information not available in our initial dataset." |
573 | 598 | ]
|
574 | 599 | },
|
575 | 600 | {
|
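As a quick, hypothetical illustration of the thresholding idea mentioned above (the 0.3 cutoff is arbitrary and chosen purely for the example), the default rounding at 0.5 could be replaced with an explicit cutoff and the confusion matrix recomputed:

    cutoff = 0.3  # arbitrary example value; lowering it trades false negatives for false positives
    thresholded = np.where(predictions > cutoff, 1, 0)  # apply the custom cutoff instead of np.round()
    pd.crosstab(index=test_data['y_yes'], columns=thresholded, rownames=['actuals'], colnames=['predictions'])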
|
584 | 609 | {
|
585 | 610 | "cell_type": "code",
|
586 | 611 | "execution_count": null,
|
587 |
| - "metadata": {}, |
| 612 | + "metadata": { |
| 613 | + "collapsed": true |
| 614 | + }, |
588 | 615 | "outputs": [],
|
589 | 616 | "source": [
|
590 | 617 | "sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)"
|
|
593 | 620 | ],
|
594 | 621 | "metadata": {
|
595 | 622 | "kernelspec": {
|
596 |
| - "display_name": "Environment (conda_python3)", |
| 623 | + "display_name": "conda_python3", |
597 | 624 | "language": "python",
|
598 | 625 | "name": "conda_python3"
|
599 | 626 | },
|
|
607 | 634 | "name": "python",
|
608 | 635 | "nbconvert_exporter": "python",
|
609 | 636 | "pygments_lexer": "ipython3",
|
610 |
| - "version": "3.6.3" |
| 637 | + "version": "3.6.2" |
611 | 638 | },
|
612 | 639 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
|
613 | 640 | },
|
|