49 | 49 | "cell_type": "code",
50 | 50 | "execution_count": null,
51 | 51 | "metadata": {
52 |    | - "collapsed": true,
53 | 52 | "isConfigCell": true,
54 | 53 | "tags": [
55 | 54 | "parameters"
⋯
80 | 79 | {
81 | 80 | "cell_type": "code",
82 | 81 | "execution_count": null,
83 |    | - "metadata": {
84 |    | - "collapsed": true
85 |    | - },
   | 82 | + "metadata": {},
86 | 83 | "outputs": [],
87 | 84 | "source": [
88 | 85 | "import pandas as pd\n",
95 | 92 | "import json\n",
|
96 | 93 | "from IPython.display import display\n",
|
97 | 94 | "from time import strftime, gmtime\n",
|
98 |
| - "import sagemaker\n", |
99 |
| - "from sagemaker.predictor import csv_serializer" |
| 95 | + "from sagemaker.inputs import TrainingInput\n", |
| 96 | + "from sagemaker.serializers import CSVSerializer" |
100 | 97 | ]
|
101 | 98 | },
|
102 | 99 | {
|
|
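Note on the import hunk above: SageMaker Python SDK v2 moved these helpers into dedicated modules, and the `csv_serializer` object became the `CSVSerializer` class. A minimal sketch of the v2 idiom (the S3 URI is a placeholder, not from this notebook):

```python
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer

# TrainingInput replaces sagemaker.s3_input as the pointer to channel data in S3.
train_input = TrainingInput(
    s3_data="s3://example-bucket/churn/train",  # placeholder URI
    content_type="csv",
)

# CSVSerializer replaces the csv_serializer module-level object; in v2 it is a
# class you instantiate and pass to deploy() or assign to predictor.serializer.
serializer = CSVSerializer()
```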
⋯
114 | 111 | {
115 | 112 | "cell_type": "code",
116 | 113 | "execution_count": null,
117 |     | - "metadata": {
118 |     | - "collapsed": true
119 |     | - },
    | 114 | + "metadata": {},
120 | 115 | "outputs": [],
121 | 116 | "source": [
122 | 117 | "!wget http://dataminingconsultant.com/DKD2e_data_sets.zip\n",
⋯
126 | 121 | {
127 | 122 | "cell_type": "code",
128 | 123 | "execution_count": null,
129 |     | - "metadata": {
130 |     | - "collapsed": true
131 |     | - },
    | 124 | + "metadata": {},
132 | 125 | "outputs": [],
133 | 126 | "source": [
134 | 127 | "churn = pd.read_csv('./Data sets/churn.txt')\n",
⋯
166 | 159 | {
167 | 160 | "cell_type": "code",
168 | 161 | "execution_count": null,
169 |     | - "metadata": {
170 |     | - "collapsed": true
171 |     | - },
    | 162 | + "metadata": {},
172 | 163 | "outputs": [],
173 | 164 | "source": [
174 | 165 | "# Frequency tables for each categorical feature\n",
⋯
195 | 186 | {
196 | 187 | "cell_type": "code",
197 | 188 | "execution_count": null,
198 |     | - "metadata": {
199 |     | - "collapsed": true
200 |     | - },
    | 189 | + "metadata": {},
201 | 190 | "outputs": [],
202 | 191 | "source": [
203 | 192 | "churn = churn.drop('Phone', axis=1)\n",
⋯
214 | 203 | {
215 | 204 | "cell_type": "code",
216 | 205 | "execution_count": null,
217 |     | - "metadata": {
218 |     | - "collapsed": true
219 |     | - },
    | 206 | + "metadata": {},
220 | 207 | "outputs": [],
221 | 208 | "source": [
222 | 209 | "for column in churn.select_dtypes(include=['object']).columns:\n",
⋯
246 | 233 | {
247 | 234 | "cell_type": "code",
248 | 235 | "execution_count": null,
249 |     | - "metadata": {
250 |     | - "collapsed": true
251 |     | - },
    | 236 | + "metadata": {},
252 | 237 | "outputs": [],
253 | 238 | "source": [
254 | 239 | "display(churn.corr())\n",
⋯
266 | 251 | {
267 | 252 | "cell_type": "code",
268 | 253 | "execution_count": null,
269 |     | - "metadata": {
270 |     | - "collapsed": true
271 |     | - },
    | 254 | + "metadata": {},
272 | 255 | "outputs": [],
273 | 256 | "source": [
274 | 257 | "churn = churn.drop(['Day Charge', 'Eve Charge', 'Night Charge', 'Intl Charge'], axis=1)"
⋯
290 | 273 | {
291 | 274 | "cell_type": "code",
292 | 275 | "execution_count": null,
293 |     | - "metadata": {
294 |     | - "collapsed": true
295 |     | - },
    | 276 | + "metadata": {},
296 | 277 | "outputs": [],
297 | 278 | "source": [
298 | 279 | "model_data = pd.get_dummies(churn)\n",
⋯
309 | 290 | {
310 | 291 | "cell_type": "code",
311 | 292 | "execution_count": null,
312 |     | - "metadata": {
313 |     | - "collapsed": true
314 |     | - },
    | 293 | + "metadata": {},
315 | 294 | "outputs": [],
316 | 295 | "source": [
317 | 296 | "train_data, validation_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.7 * len(model_data)), int(0.9 * len(model_data))])\n",
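For reference, the `np.split` call above shuffles the frame once (seeded for reproducibility) and cuts it at the 70% and 90% marks, yielding a 70/20/10 train/validation/test split. An equivalent sketch with explicit slicing:

```python
shuffled = model_data.sample(frac=1, random_state=1729)  # deterministic shuffle
n = len(shuffled)
train_data = shuffled.iloc[:int(0.7 * n)]                # first 70%
validation_data = shuffled.iloc[int(0.7 * n):int(0.9 * n)]  # next 20%
test_data = shuffled.iloc[int(0.9 * n):]                 # final 10%
```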
⋯
329 | 308 | {
330 | 309 | "cell_type": "code",
331 | 310 | "execution_count": null,
332 |     | - "metadata": {
333 |     | - "collapsed": true
334 |     | - },
    | 311 | + "metadata": {},
335 | 312 | "outputs": [],
336 | 313 | "source": [
337 | 314 | "boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')\n",
⋯
351 | 328 | {
352 | 329 | "cell_type": "code",
353 | 330 | "execution_count": null,
354 |     | - "metadata": {
355 |     | - "collapsed": true
356 |     | - },
    | 331 | + "metadata": {},
357 | 332 | "outputs": [],
358 | 333 | "source": [
359 |     | - "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
360 |     | - "container = get_image_uri(boto3.Session().region_name, 'xgboost')"
    | 334 | + "container = sagemaker.image_uris.retrieve('xgboost', boto3.Session().region_name, '1')\n",
    | 335 | + "display(container)"
361 | 336 | ]
362 | 337 | },
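In SDK v2, `get_image_uri` is gone; `sagemaker.image_uris.retrieve` takes the framework name, region, and version (the `'1'` selects the 1.x built-in XGBoost image). A minimal sketch of the same lookup with keyword arguments:

```python
import boto3
import sagemaker

# Look up the ECR image URI for the built-in XGBoost algorithm in this region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1",
)
print(container)
```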
363 | 338 | {
364 | 339 | "cell_type": "markdown",
365 | 340 | "metadata": {},
366 | 341 | "source": [
367 |     | - "Then, because we're training with the CSV file format, we'll create `s3_input`s that our training function can use as a pointer to the files in S3."
    | 342 | + "Then, because we're training with the CSV file format, we'll create `TrainingInput`s that our training function can use as a pointer to the files in S3."
368 | 343 | ]
369 | 344 | },
|
370 | 345 | {
|
371 | 346 | "cell_type": "code",
|
372 | 347 | "execution_count": null,
|
373 |
| - "metadata": { |
374 |
| - "collapsed": true |
375 |
| - }, |
| 348 | + "metadata": {}, |
376 | 349 | "outputs": [],
|
377 | 350 | "source": [
|
378 |
| - "s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n", |
379 |
| - "s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')" |
| 351 | + "s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')\n", |
| 352 | + "s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')" |
380 | 353 | ]
|
381 | 354 | },
|
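These `TrainingInput` objects are consumed as a channel dictionary when the estimator is fit; the built-in XGBoost algorithm expects a `train` channel and, optionally, a `validation` channel. A sketch, assuming the `xgb` estimator constructed in the next hunk:

```python
# The dict keys are channel names; each value points at the CSV data in S3.
xgb.fit({"train": s3_input_train, "validation": s3_input_validation})
```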
382 | 355 | {
⋯
396 | 369 | {
397 | 370 | "cell_type": "code",
398 | 371 | "execution_count": null,
399 |     | - "metadata": {
400 |     | - "collapsed": true
401 |     | - },
    | 372 | + "metadata": {},
402 | 373 | "outputs": [],
403 | 374 | "source": [
404 | 375 | "sess = sagemaker.Session()\n",
405 | 376 | "\n",
406 | 377 | "xgb = sagemaker.estimator.Estimator(container,\n",
407 | 378 | " role, \n",
408 |     | - " train_instance_count=1, \n",
409 |     | - " train_instance_type='ml.m4.xlarge',\n",
    | 379 | + " instance_count=1, \n",
    | 380 | + " instance_type='ml.m4.xlarge',\n",
410 | 381 | " output_path='s3://{}/{}/output'.format(bucket, prefix),\n",
411 | 382 | " sagemaker_session=sess)\n",
412 | 383 | "xgb.set_hyperparameters(max_depth=5,\n",
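SDK v2 drops the `train_` prefix from estimator constructor arguments; the same renaming applies to arguments this notebook doesn't use, e.g. `train_max_run` becomes `max_run` and `train_volume_size` becomes `volume_size`. Sketch of the renamed constructor, reusing `container`, `role`, `bucket`, `prefix`, and `sess` from earlier cells:

```python
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,              # v1: train_instance_count
    instance_type="ml.m4.xlarge",  # v1: train_instance_type
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
    # Optional arguments follow the same rename pattern, for example:
    # max_run=3600,                # v1: train_max_run
    # volume_size=30,              # v1: train_volume_size
)
```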
437 | 408 | "outputs": [],
|
438 | 409 | "source": [
|
439 | 410 | "compiled_model = xgb\n",
|
440 |
| - "if xgb.create_model().check_neo_region(boto3.Session().region_name) is False:\n", |
441 |
| - " print('Neo is not currently supported in', boto3.Session().region_name)\n", |
442 |
| - "else:\n", |
443 |
| - " output_path = '/'.join(xgb.output_path.split('/')[:-1])\n", |
444 |
| - " compiled_model = xgb.compile_model(target_instance_family='ml_m4', \n", |
445 |
| - " input_shape={'data': [1, 69]},\n", |
446 |
| - " role=role,\n", |
447 |
| - " framework='xgboost',\n", |
448 |
| - " framework_version='0.7',\n", |
449 |
| - " output_path=output_path)\n", |
450 |
| - " compiled_model.name = 'deployed-xgboost-customer-churn'\n", |
451 |
| - " compiled_model.image = get_image_uri(sess.boto_region_name, 'xgboost-neo', repo_version='latest')" |
| 411 | + "output_path = '/'.join(xgb.output_path.split('/')[:-1])\n", |
| 412 | + "compiled_model = xgb.compile_model(target_instance_family='ml_m4', \n", |
| 413 | + " input_shape={'data': [1, 69]},\n", |
| 414 | + " role=role,\n", |
| 415 | + " framework='xgboost',\n", |
| 416 | + " framework_version='latest',\n", |
| 417 | + " output_path=output_path)\n", |
| 418 | + "compiled_model.name = 'deployed-xgboost-customer-churn'" |
452 | 419 | ]
|
453 | 420 | },
|
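The v1 `check_neo_region` helper no longer exists in SDK v2, so this hunk compiles unconditionally. Neo support still varies by region; if you want the old graceful fallback, one option (a sketch, not part of this change) is to catch the failure and keep the uncompiled estimator:

```python
compiled_model = xgb
try:
    output_path = "/".join(xgb.output_path.split("/")[:-1])
    compiled_model = xgb.compile_model(
        target_instance_family="ml_m4",
        input_shape={"data": [1, 69]},  # 69 features after one-hot encoding
        role=role,
        framework="xgboost",
        framework_version="latest",
        output_path=output_path,
    )
    compiled_model.name = "deployed-xgboost-customer-churn"
except Exception as e:
    print("Compilation failed (e.g. Neo unavailable in this region):", e)
```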
454 | 421 | {
⋯
464 | 431 | {
465 | 432 | "cell_type": "code",
466 | 433 | "execution_count": null,
467 |     | - "metadata": {
468 |     | - "collapsed": true
469 |     | - },
    | 434 | + "metadata": {},
470 | 435 | "outputs": [],
471 | 436 | "source": [
472 |     | - "xgb_predictor = compiled_model.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')"
    | 437 | + "xgb_predictor = compiled_model.deploy(\n",
    | 438 | + " initial_instance_count = 1, \n",
    | 439 | + " instance_type = 'ml.m4.xlarge',\n",
    | 440 | + " serializer=CSVSerializer())"
473 | 441 | ]
474 | 442 | },
475 | 443 | {
481 | 449 | "Now that we have a hosted endpoint running, we can make real-time predictions from our model very easily, simply by making an http POST request. But first, we'll need to setup serializers and deserializers for passing our `test_data` NumPy arrays to the model behind the endpoint."
|
482 | 450 | ]
|
483 | 451 | },
|
484 |
| - { |
485 |
| - "cell_type": "code", |
486 |
| - "execution_count": null, |
487 |
| - "metadata": { |
488 |
| - "collapsed": true |
489 |
| - }, |
490 |
| - "outputs": [], |
491 |
| - "source": [ |
492 |
| - "xgb_predictor.content_type = 'text/csv'\n", |
493 |
| - "xgb_predictor.serializer = csv_serializer\n", |
494 |
| - "xgb_predictor.deserializer = None" |
495 |
| - ] |
496 |
| - }, |
497 | 452 | {
|
498 | 453 | "cell_type": "markdown",
|
499 | 454 | "metadata": {},
|
|
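The deleted cell is intentional fallout from the deploy hunk above: in SDK v2 the serializer is attached when the endpoint is created, so there is nothing left to assign on the predictor. Invocation itself is unchanged; a minimal sketch, assuming `test_data`'s first column is the label (as the crosstabs below do) and that the endpoint returns a single probability as text, as the built-in XGBoost does:

```python
# CSVSerializer (set at deploy time) turns the array into a text/csv request body;
# with no deserializer configured, the response comes back as raw bytes.
sample = test_data.values[0][1:]      # drop the label column
raw = xgb_predictor.predict(sample)
print(float(raw.decode("utf-8")))     # predicted churn probability
```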
⋯
509 | 464 | {
510 | 465 | "cell_type": "code",
511 | 466 | "execution_count": null,
512 |     | - "metadata": {
513 |     | - "collapsed": true
514 |     | - },
    | 467 | + "metadata": {},
515 | 468 | "outputs": [],
516 | 469 | "source": [
517 | 470 | "def predict(data, rows=500):\n",
|
536 | 489 | "cell_type": "code",
|
537 | 490 | "execution_count": null,
|
538 |
| - "metadata": { |
539 |
| - "collapsed": true |
540 |
| - }, |
| 491 | + "metadata": {}, |
541 | 492 | "outputs": [],
|
542 | 493 | "source": [
|
543 | 494 | "pd.crosstab(index=test_data.iloc[:, 0], columns=np.round(predictions), rownames=['actual'], colnames=['predictions'])"
|
|
559 | 510 | {
|
560 | 511 | "cell_type": "code",
|
561 | 512 | "execution_count": null,
|
562 |
| - "metadata": { |
563 |
| - "collapsed": true |
564 |
| - }, |
| 513 | + "metadata": {}, |
565 | 514 | "outputs": [],
|
566 | 515 | "source": [
|
567 | 516 | "plt.hist(predictions)\n",
|
|
578 | 527 | {
|
579 | 528 | "cell_type": "code",
|
580 | 529 | "execution_count": null,
|
581 |
| - "metadata": { |
582 |
| - "collapsed": true |
583 |
| - }, |
| 530 | + "metadata": {}, |
584 | 531 | "outputs": [],
|
585 | 532 | "source": [
|
586 | 533 | "pd.crosstab(index=test_data.iloc[:, 0], columns=np.where(predictions > 0.3, 1, 0))"
|
|
629 | 576 | {
|
630 | 577 | "cell_type": "code",
|
631 | 578 | "execution_count": null,
|
632 |
| - "metadata": { |
633 |
| - "collapsed": true |
634 |
| - }, |
| 579 | + "metadata": {}, |
635 | 580 | "outputs": [],
|
636 | 581 | "source": [
|
637 | 582 | "cutoffs = np.arange(0.01, 1, 0.01)\n",
|
|
643 | 588 | "\n",
|
644 | 589 | "costs = np.array(costs)\n",
|
645 | 590 | "plt.plot(cutoffs, costs)\n",
|
646 |
| - "plt.show()\n", |
| 591 | + "plt.show()" |
| 592 | + ] |
| 593 | + }, |
| 594 | + { |
| 595 | + "cell_type": "code", |
| 596 | + "execution_count": null, |
| 597 | + "metadata": {}, |
| 598 | + "outputs": [], |
| 599 | + "source": [ |
647 | 600 | "print('Cost is minimized near a cutoff of:', cutoffs[np.argmin(costs)], 'for a cost of:', np.min(costs))"
|
648 | 601 | ]
|
649 | 602 | },
|
|
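This hunk also splits the cell so the plot renders before the printed summary. For context, `costs` comes from sweeping the cutoff and pricing each resulting confusion matrix; a sketch of that loop, with a hypothetical cost matrix (the actual dollar values live in the notebook's surrounding markdown):

```python
# Hypothetical costs: rows = actual (stay, churn), cols = predicted (stay, churn).
# A missed churner is expensive; every predicted churner gets a retention incentive.
cost_matrix = np.array([[0, 100],
                        [500, 100]])

cutoffs = np.arange(0.01, 1, 0.01)
costs = []
for c in cutoffs:
    matrix = pd.crosstab(index=test_data.iloc[:, 0],
                         columns=np.where(predictions > c, 1, 0))
    costs.append(np.sum(np.sum(cost_matrix * matrix)))
costs = np.array(costs)
```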
⋯
683 | 636 | {
684 | 637 | "cell_type": "code",
685 | 638 | "execution_count": null,
686 |     | - "metadata": {
687 |     | - "collapsed": true
688 |     | - },
    | 639 | + "metadata": {},
689 | 640 | "outputs": [],
690 | 641 | "source": [
691 |     | - "sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)"
    | 642 | + "xgb_predictor.delete_endpoint()"
692 | 643 | ]
    | 644 | + },
    | 645 | + {
    | 646 | + "cell_type": "code",
    | 647 | + "execution_count": null,
    | 648 | + "metadata": {},
    | 649 | + "outputs": [],
    | 650 | + "source": []
693 | 651 | }
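In SDK v2 cleanup moves onto the predictor itself, and `delete_endpoint` removes the endpoint configuration along with the endpoint by default:

```python
# Tear down the hosted endpoint (and its endpoint config) to stop incurring charges.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
```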
694 | 652 | ],
695 | 653 | "metadata": {
696 | 654 | "celltoolbar": "Tags",
697 | 655 | "kernelspec": {
698 |     | - "display_name": "Python 3",
    | 656 | + "display_name": "conda_python3",
699 | 657 | "language": "python",
700 |     | - "name": "python3"
    | 658 | + "name": "conda_python3"
701 | 659 | },
702 | 660 | "language_info": {
703 | 661 | "codemirror_mode": {
709 | 667 | "name": "python",
|
710 | 668 | "nbconvert_exporter": "python",
|
711 | 669 | "pygments_lexer": "ipython3",
|
712 |
| - "version": "3.7.3" |
| 670 | + "version": "3.6.10" |
713 | 671 | },
|
714 | 672 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
|
715 | 673 | },
|
|