|
53 | 53 | "cell_type": "code",
|
54 | 54 | "execution_count": null,
|
55 | 55 | "metadata": {
|
56 |
| - "collapsed": true, |
57 | 56 | "isConfigCell": true,
|
58 | 57 | "tags": [
|
59 | 58 | "parameters"
|
60 | 59 | ]
|
61 | 60 | },
|
62 | 61 | "outputs": [],
|
63 | 62 | "source": [
|
64 |
| - "bucket = '<your_s3_bucket_name_here>'\n", |
65 |
| - "prefix = 'sagemaker/DEMO-xgboost-dm'\n", |
66 |
| - " \n", |
67 |
| - "# Define IAM role\n", |
68 | 63 | "import boto3\n",
|
| 64 | + "import pandas\n", |
69 | 65 | "import re\n",
|
| 66 | + "import sagemaker # Amazon SageMaker's Python SDK provides many helper functions\n", |
70 | 67 | "from sagemaker import get_execution_role\n",
|
71 | 68 | "\n",
|
72 |
| - "role = get_execution_role()" |
| 69 | + "\n", |
| 70 | + "# Define IAM role\n", |
| 71 | + "role = get_execution_role()\n", |
| 72 | + "\n", |
| 73 | + "sess = sagemaker.Session()\n", |
| 74 | + "\n", |
| 75 | + "prefix = 'sagemaker/DEMO-xgboost-dm'\n", |
| 76 | + "\n", |
| 77 | + "# S3 bucket which will contain the train and test datasets\n", |
| 78 | + "bucket = sess.default_bucket()" |
73 | 79 | ]
|
74 | 80 | },
|
75 | 81 | {
|
|
82 | 88 | {
|
83 | 89 | "cell_type": "code",
|
84 | 90 | "execution_count": null,
|
85 |
| - "metadata": { |
86 |
| - "collapsed": true |
87 |
| - }, |
| 91 | + "metadata": {}, |
88 | 92 | "outputs": [],
|
89 | 93 | "source": [
|
90 | 94 | "import numpy as np # For matrix operations and numerical processing\n",
|
|
97 | 101 | "import math # For ceiling function\n",
|
98 | 102 | "import json # For parsing hosting outputs\n",
|
99 | 103 | "import os # For manipulating filepath names\n",
|
100 |
| - "import sagemaker # Amazon SageMaker's Python SDK provides many helper functions\n", |
101 |
| - "from sagemaker.predictor import csv_serializer # Converts strings for HTTP POST requests on inference" |
| 104 | + "import zipfile # unzips the dataset\n", |
| 105 | + "from sagemaker import image_uris\n", |
| 106 | + "from sagemaker.inputs import TrainingInput\n", |
| 107 | + "from sagemaker.estimator import Estimator\n", |
| 108 | + "from sagemaker.serializers import CSVSerializer" |
102 | 109 | ]
|
103 | 110 | },
|
104 | 111 | {
|
|
116 | 123 | {
|
117 | 124 | "cell_type": "code",
|
118 | 125 | "execution_count": null,
|
119 |
| - "metadata": { |
120 |
| - "collapsed": true |
121 |
| - }, |
| 126 | + "metadata": {}, |
122 | 127 | "outputs": [],
|
123 | 128 | "source": [
|
124 | 129 | "!wget https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip\n",
|
125 |
| - "!apt-get install unzip -y\n", |
126 |
| - "!unzip -o bank-additional.zip" |
| 130 | + "\n", |
| 131 | + "with zipfile.ZipFile('bank-additional.zip', 'r') as zip_ref:\n", |
| 132 | + " zip_ref.extractall('.')" |
127 | 133 | ]
|
128 | 134 | },
|
129 | 135 | {
|
|
136 | 142 | {
|
137 | 143 | "cell_type": "code",
|
138 | 144 | "execution_count": null,
|
139 |
| - "metadata": { |
140 |
| - "collapsed": true |
141 |
| - }, |
| 145 | + "metadata": {}, |
142 | 146 | "outputs": [],
|
143 | 147 | "source": [
|
144 | 148 | "data = pd.read_csv('./bank-additional/bank-additional-full.csv')\n",
|
|
204 | 208 | {
|
205 | 209 | "cell_type": "code",
|
206 | 210 | "execution_count": null,
|
207 |
| - "metadata": { |
208 |
| - "collapsed": true, |
209 |
| - "scrolled": false |
210 |
| - }, |
| 211 | + "metadata": {}, |
211 | 212 | "outputs": [],
|
212 | 213 | "source": [
|
213 | 214 | "# Frequency tables for each categorical feature\n",
|
|
242 | 243 | {
|
243 | 244 | "cell_type": "code",
|
244 | 245 | "execution_count": null,
|
245 |
| - "metadata": { |
246 |
| - "collapsed": true, |
247 |
| - "scrolled": false |
248 |
| - }, |
| 246 | + "metadata": {}, |
249 | 247 | "outputs": [],
|
250 | 248 | "source": [
|
251 | 249 | "for column in data.select_dtypes(include=['object']).columns:\n",
|
|
273 | 271 | {
|
274 | 272 | "cell_type": "code",
|
275 | 273 | "execution_count": null,
|
276 |
| - "metadata": { |
277 |
| - "collapsed": true |
278 |
| - }, |
| 274 | + "metadata": {}, |
279 | 275 | "outputs": [],
|
280 | 276 | "source": [
|
281 | 277 | "display(data.corr())\n",
|
|
314 | 310 | {
|
315 | 311 | "cell_type": "code",
|
316 | 312 | "execution_count": null,
|
317 |
| - "metadata": { |
318 |
| - "collapsed": true |
319 |
| - }, |
| 313 | + "metadata": {}, |
320 | 314 | "outputs": [],
|
321 | 315 | "source": [
|
322 | 316 | "data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0) # Indicator variable to capture when pdays takes a value of 999\n",
|
|
338 | 332 | {
|
339 | 333 | "cell_type": "code",
|
340 | 334 | "execution_count": null,
|
341 |
| - "metadata": { |
342 |
| - "collapsed": true |
343 |
| - }, |
| 335 | + "metadata": {}, |
344 | 336 | "outputs": [],
|
345 | 337 | "source": [
|
346 | 338 | "model_data = model_data.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'], axis=1)"
|
|
358 | 350 | {
|
359 | 351 | "cell_type": "code",
|
360 | 352 | "execution_count": null,
|
361 |
| - "metadata": { |
362 |
| - "collapsed": true |
363 |
| - }, |
| 353 | + "metadata": {}, |
364 | 354 | "outputs": [],
|
365 | 355 | "source": [
|
366 | 356 | "train_data, validation_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data)), int(0.9 * len(model_data))]) # Randomly sort the data then split out first 70%, second 20%, and last 10%"
|
|
376 | 366 | {
|
377 | 367 | "cell_type": "code",
|
378 | 368 | "execution_count": null,
|
379 |
| - "metadata": { |
380 |
| - "collapsed": true |
381 |
| - }, |
| 369 | + "metadata": {}, |
382 | 370 | "outputs": [],
|
383 | 371 | "source": [
|
384 | 372 | "pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)\n",
|
|
395 | 383 | {
|
396 | 384 | "cell_type": "code",
|
397 | 385 | "execution_count": null,
|
398 |
| - "metadata": { |
399 |
| - "collapsed": true |
400 |
| - }, |
| 386 | + "metadata": {}, |
401 | 387 | "outputs": [],
|
402 | 388 | "source": [
|
403 |
| - "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')\n", |
404 |
| - "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')" |
| 389 | + "sess.upload_data(path='train.csv', bucket=bucket, key_prefix=prefix+'/train')  # upload_data appends the file name to key_prefix\n", |
| 390 | + "sess.upload_data(path='validation.csv', bucket=bucket, key_prefix=prefix+'/validation')" |
405 | 391 | ]
|
406 | 392 | },
|
407 | 393 | {
|
|
423 | 409 | {
|
424 | 410 | "cell_type": "code",
|
425 | 411 | "execution_count": null,
|
426 |
| - "metadata": { |
427 |
| - "collapsed": true |
428 |
| - }, |
| 412 | + "metadata": {}, |
429 | 413 | "outputs": [],
|
430 | 414 | "source": [
|
431 |
| - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", |
432 |
| - "container = get_image_uri(boto3.Session().region_name, 'xgboost')" |
| 415 | + "container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-1' )" |
433 | 416 | ]
|
434 | 417 | },
|
435 | 418 | {
|
|
442 | 425 | {
|
443 | 426 | "cell_type": "code",
|
444 | 427 | "execution_count": null,
|
445 |
| - "metadata": { |
446 |
| - "collapsed": true |
447 |
| - }, |
| 428 | + "metadata": {}, |
448 | 429 | "outputs": [],
|
449 | 430 | "source": [
|
450 |
| - "s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n", |
451 |
| - "s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')" |
| 431 | + "s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n", |
| 432 | + "s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')" |
452 | 433 | ]
|
453 | 434 | },
|
454 | 435 | {
|
|
469 | 450 | {
|
470 | 451 | "cell_type": "code",
|
471 | 452 | "execution_count": null,
|
472 |
| - "metadata": { |
473 |
| - "collapsed": true |
474 |
| - }, |
| 453 | + "metadata": {}, |
475 | 454 | "outputs": [],
|
476 | 455 | "source": [
|
477 |
| - "sess = sagemaker.Session()\n", |
| 456 | + "xgb = Estimator(container,\n", |
| 457 | + " role,\n", |
| 458 | + " instance_count=1,\n", |
| 459 | + " instance_type='ml.m4.xlarge',\n", |
| 460 | + " output_path='s3://{}/{}/output'.format(bucket, prefix))\n", |
478 | 461 | "\n",
|
479 |
| - "xgb = sagemaker.estimator.Estimator(container,\n", |
480 |
| - " role, \n", |
481 |
| - " train_instance_count=1, \n", |
482 |
| - " train_instance_type='ml.m4.xlarge',\n", |
483 |
| - " output_path='s3://{}/{}/output'.format(bucket, prefix),\n", |
484 |
| - " sagemaker_session=sess)\n", |
485 | 462 | "xgb.set_hyperparameters(max_depth=5,\n",
|
486 | 463 | " eta=0.2,\n",
|
487 | 464 | " gamma=4,\n",
|
488 | 465 | " min_child_weight=6,\n",
|
489 | 466 | " subsample=0.8,\n",
|
490 |
| - " silent=0,\n", |
491 | 467 | " objective='binary:logistic',\n",
|
492 | 468 | " num_round=100)\n",
|
493 | 469 | "\n",
|
|
507 | 483 | {
|
508 | 484 | "cell_type": "code",
|
509 | 485 | "execution_count": null,
|
510 |
| - "metadata": { |
511 |
| - "collapsed": true |
512 |
| - }, |
| 486 | + "metadata": {}, |
513 | 487 | "outputs": [],
|
514 | 488 | "source": [
|
515 | 489 | "xgb_predictor = xgb.deploy(initial_instance_count=1,\n",
|
|
533 | 507 | {
|
534 | 508 | "cell_type": "code",
|
535 | 509 | "execution_count": null,
|
536 |
| - "metadata": { |
537 |
| - "collapsed": true |
538 |
| - }, |
| 510 | + "metadata": {}, |
539 | 511 | "outputs": [],
|
540 | 512 | "source": [
|
541 |
| - "xgb_predictor.content_type = 'text/csv'\n", |
542 |
| - "xgb_predictor.serializer = csv_serializer" |
| 513 | + "xgb_predictor.serializer = CSVSerializer()" |
543 | 514 | ]
|
544 | 515 | },
|
545 | 516 | {
|
|
557 | 528 | {
|
558 | 529 | "cell_type": "code",
|
559 | 530 | "execution_count": null,
|
560 |
| - "metadata": { |
561 |
| - "collapsed": true |
562 |
| - }, |
| 531 | + "metadata": {}, |
563 | 532 | "outputs": [],
|
564 | 533 | "source": [
|
565 | 534 | "def predict(data, rows=500):\n",
|
|
583 | 552 | {
|
584 | 553 | "cell_type": "code",
|
585 | 554 | "execution_count": null,
|
586 |
| - "metadata": { |
587 |
| - "collapsed": true |
588 |
| - }, |
| 555 | + "metadata": {}, |
589 | 556 | "outputs": [],
|
590 | 557 | "source": [
|
591 | 558 | "pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions), rownames=['actuals'], colnames=['predictions'])"
|
|
623 | 590 | {
|
624 | 591 | "cell_type": "code",
|
625 | 592 | "execution_count": null,
|
626 |
| - "metadata": { |
627 |
| - "collapsed": true |
628 |
| - }, |
| 593 | + "metadata": {}, |
629 | 594 | "outputs": [],
|
630 | 595 | "source": [
|
631 |
| - "sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)" |
| 596 | + "xgb_predictor.delete_endpoint()" |
632 | 597 | ]
|
633 | 598 | }
|
634 | 599 | ],
|
635 | 600 | "metadata": {
|
636 | 601 | "celltoolbar": "Tags",
|
637 | 602 | "kernelspec": {
|
638 |
| - "display_name": "conda_python3", |
| 603 | + "display_name": "Python 3 (Data Science)", |
639 | 604 | "language": "python",
|
640 |
| - "name": "conda_python3" |
| 605 | + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" |
641 | 606 | },
|
642 | 607 | "language_info": {
|
643 | 608 | "codemirror_mode": {
|
|
649 | 614 | "name": "python",
|
650 | 615 | "nbconvert_exporter": "python",
|
651 | 616 | "pygments_lexer": "ipython3",
|
652 |
| - "version": "3.6.2" |
| 617 | + "version": "3.7.6" |
653 | 618 | },
|
654 | 619 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
|
655 | 620 | },
|
656 | 621 | "nbformat": 4,
|
657 |
| - "nbformat_minor": 2 |
| 622 | + "nbformat_minor": 4 |
658 | 623 | }
|
0 commit comments