Commit a2a3c37

fix unzip package missing, fixed the container version contract (#1761)
* fix unzip package missing, fixed the container version contract
* fix import error
* fixed the CI issues
* upgrade to V2
1 parent 0abd786 commit a2a3c37
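
For readers skimming the diff below, here is a minimal, consolidated sketch of the SageMaker Python SDK v2 calls the notebook moves to: a pinned '1.2-1' XGBoost container via image_uris.retrieve, TrainingInput in place of sagemaker.s3_input, Estimator with instance_count/instance_type instead of the train_* names, CSVSerializer instead of the module-level csv_serializer, the standard-library zipfile instead of the system unzip binary, and endpoint cleanup on the predictor itself. It is assembled from the hunks below rather than copied from any single cell, and it assumes bank-additional.zip has already been downloaded (the !wget line is unchanged) and that the train/validation CSVs have been uploaded under the notebook's usual s3://<bucket>/sagemaker/DEMO-xgboost-dm prefix.

import zipfile

import boto3
import sagemaker
from sagemaker import image_uris, get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.serializers import CSVSerializer

# Session, role, and default bucket replace the hard-coded bucket placeholder
sess = sagemaker.Session()
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
bucket = sess.default_bucket()

# Standard-library extraction instead of shelling out to `unzip`, which the image may not ship
with zipfile.ZipFile('bank-additional.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Pin the XGBoost container version rather than taking get_image_uri's default
container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-1')

# SDK v2 renames: sagemaker.s3_input -> TrainingInput, train_instance_* -> instance_*
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

xgb = Estimator(container,
                role,
                instance_count=1,
                instance_type='ml.m4.xlarge',
                output_path='s3://{}/{}/output'.format(bucket, prefix))
xgb.set_hyperparameters(max_depth=5, eta=0.2, gamma=4, min_child_weight=6,
                        subsample=0.8, objective='binary:logistic', num_round=100)
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

# Serialization moves from the module-level csv_serializer to a CSVSerializer instance
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')
xgb_predictor.serializer = CSVSerializer()

# Cleanup is now a method on the predictor itself
xgb_predictor.delete_endpoint()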

File tree

1 file changed: +55 -90 lines changed

introduction_to_applying_machine_learning/xgboost_direct_marketing/xgboost_direct_marketing_sagemaker.ipynb

Lines changed: 55 additions & 90 deletions
@@ -53,23 +53,29 @@
  "cell_type": "code",
  "execution_count": null,
  "metadata": {
- "collapsed": true,
  "isConfigCell": true,
  "tags": [
  "parameters"
  ]
  },
  "outputs": [],
  "source": [
- "bucket = '<your_s3_bucket_name_here>'\n",
- "prefix = 'sagemaker/DEMO-xgboost-dm'\n",
- " \n",
- "# Define IAM role\n",
  "import boto3\n",
+ "import pandas\n",
  "import re\n",
+ "import sagemaker # Amazon SageMaker's Python SDK provides many helper functions\n",
  "from sagemaker import get_execution_role\n",
  "\n",
- "role = get_execution_role()"
+ "\n",
+ " # Define IAM role\n",
+ "role = get_execution_role()\n",
+ "\n",
+ "sess = sagemaker.Session()\n",
+ "\n",
+ "prefix = 'sagemaker/DEMO-xgboost-dm'\n",
+ "\n",
+ "# S3 bucket which will contain the train and test datasets\n",
+ "bucket = sess.default_bucket()"
  ]
  },
  {
@@ -82,9 +88,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "import numpy as np # For matrix operations and numerical processing\n",
@@ -97,8 +101,11 @@
  "import math # For ceiling function\n",
  "import json # For parsing hosting outputs\n",
  "import os # For manipulating filepath names\n",
- "import sagemaker # Amazon SageMaker's Python SDK provides many helper functions\n",
- "from sagemaker.predictor import csv_serializer # Converts strings for HTTP POST requests on inference"
+ "import zipfile # unzips the dataset\n",
+ "from sagemaker import image_uris\n",
+ "from sagemaker.inputs import TrainingInput\n",
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.serializers import CSVSerializer"
  ]
  },
  {
@@ -116,14 +123,13 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "!wget https://sagemaker-sample-data-us-west-2.s3-us-west-2.amazonaws.com/autopilot/direct_marketing/bank-additional.zip\n",
- "!apt-get install unzip -y\n",
- "!unzip -o bank-additional.zip"
+ "\n",
+ "with zipfile.ZipFile('bank-additional.zip', 'r') as zip_ref:\n",
+ "    zip_ref.extractall('.')"
  ]
  },
  {
@@ -136,9 +142,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "data = pd.read_csv('./bank-additional/bank-additional-full.csv')\n",
@@ -204,10 +208,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true,
- "scrolled": false
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "# Frequency tables for each categorical feature\n",
@@ -242,10 +243,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true,
- "scrolled": false
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "for column in data.select_dtypes(include=['object']).columns:\n",
@@ -273,9 +271,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "display(data.corr())\n",
@@ -314,9 +310,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0) # Indicator variable to capture when pdays takes a value of 999\n",
@@ -338,9 +332,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "model_data = model_data.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'], axis=1)"
@@ -358,9 +350,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "train_data, validation_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data)), int(0.9 * len(model_data))]) # Randomly sort the data then split out first 70%, second 20%, and last 10%"
@@ -376,9 +366,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)\n",
@@ -395,13 +383,11 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')\n",
- "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')"
+ "sess.upload_data(path='train.csv', bucket=bucket, key_prefix=prefix+'/train/train.csv')\n",
+ "sess.upload_data(path='validation.csv', bucket=bucket, key_prefix=prefix+'/validation/validation.csv')"
  ]
  },
  {
@@ -423,13 +409,10 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
- "container = get_image_uri(boto3.Session().region_name, 'xgboost')"
+ "container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1.2-1' )"
  ]
  },
  {
@@ -442,13 +425,11 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n",
- "s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')"
+ "s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n",
+ "s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')"
  ]
  },
  {
@@ -469,25 +450,20 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "sess = sagemaker.Session()\n",
+ "xgb = Estimator(container,\n",
+ "                role,\n",
+ "                instance_count=1,\n",
+ "                instance_type='ml.m4.xlarge',\n",
+ "                output_path='s3://{}/{}/output'.format(bucket, prefix))\n",
  "\n",
- "xgb = sagemaker.estimator.Estimator(container,\n",
- "                                    role, \n",
- "                                    train_instance_count=1, \n",
- "                                    train_instance_type='ml.m4.xlarge',\n",
- "                                    output_path='s3://{}/{}/output'.format(bucket, prefix),\n",
- "                                    sagemaker_session=sess)\n",
  "xgb.set_hyperparameters(max_depth=5,\n",
  "                        eta=0.2,\n",
  "                        gamma=4,\n",
  "                        min_child_weight=6,\n",
  "                        subsample=0.8,\n",
- "                        silent=0,\n",
  "                        objective='binary:logistic',\n",
  "                        num_round=100)\n",
  "\n",
@@ -507,9 +483,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "xgb_predictor = xgb.deploy(initial_instance_count=1,\n",
@@ -533,13 +507,10 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "xgb_predictor.content_type = 'text/csv'\n",
- "xgb_predictor.serializer = csv_serializer"
+ "xgb_predictor.serializer = CSVSerializer()"
  ]
  },
  {
@@ -557,9 +528,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "def predict(data, rows=500):\n",
@@ -583,9 +552,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
  "pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions), rownames=['actuals'], colnames=['predictions'])"
@@ -623,21 +590,19 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
  "outputs": [],
  "source": [
- "sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)"
+ "xgb_predictor.delete_endpoint()"
  ]
  }
  ],
  "metadata": {
  "celltoolbar": "Tags",
  "kernelspec": {
- "display_name": "conda_python3",
+ "display_name": "Python 3 (Data Science)",
  "language": "python",
- "name": "conda_python3"
+ "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
  },
  "language_info": {
  "codemirror_mode": {
@@ -649,10 +614,10 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.6.2"
+ "version": "3.7.6"
  },
  "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
  }
