  "cell_type": "code",
  "execution_count": null,
  "metadata": {
+ "collapsed": true,
  "isConfigCell": true,
  "tags": [
  "parameters"
  ]
  },
  "outputs": [],
  "source": [
- "# Define IAM role\n",
- "import sagemaker\n",
- "import boto3\n",
- "import re\n",
- "from sagemaker import get_execution_role\n",
+ "# Define IAM role\n",
+ "import sagemaker\n",
+ "import boto3\n",
+ "import re\n",
+ "from sagemaker import get_execution_role\n",
+ "\n",
+ "sess = sagemaker.Session()\n",
+ "bucket = sess.default_bucket()\n",
+ "prefix = 'ntm_demo'\n",
  "\n",
- "sess = sagemaker.Session()\n",
- "bucket = sess.default_bucket()\n",
- "prefix = \"ntm_demo\"\n",
  "role = get_execution_role()"
  ]
  },
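
Note: this config cell pins everything later cells reuse: the SageMaker session, the account's default S3 bucket, a key prefix for this demo, and the notebook's IAM execution role. The new side must keep a `prefix` definition, since the upload and training cells below still reference it. A minimal sketch of how the bucket/prefix pair is consumed further down (variable names are the notebook's own; the URIs are illustrative):

    # Sketch: where later cells put things, given the config above
    s3_train_uri = 's3://{}/{}/train'.format(bucket, prefix)    # training data upload
    s3_output_uri = 's3://{}/{}/output'.format(bucket, prefix)  # model artifacts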

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "import numpy as np\n",

  "from IPython.display import display\n",
  "import scipy\n",
  "import sagemaker.amazon.common as smac\n",
- "from sagemaker.serializers import CSVSerializer\n",
- "from sagemaker.deserializers import JSONDeserializer"
+ "from sagemaker.predictor import csv_serializer, json_deserializer"
  ]
  },
  {
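
Note: the import swap above is the heart of this change. CSVSerializer and JSONDeserializer are SageMaker Python SDK v2 classes; csv_serializer and json_deserializer are the ready-made module-level instances that SDK v1 ships in sagemaker.predictor. A sketch of the two equivalent styles (assuming the matching SDK major version is installed):

    # SDK v1 (<2.0): import ready-made serializer objects
    from sagemaker.predictor import csv_serializer, json_deserializer

    # SDK v2 (>=2.0): import classes and instantiate them yourself
    # from sagemaker.serializers import CSVSerializer
    # from sagemaker.deserializers import JSONDeserializer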

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "# generate the sample data\n",
  "num_documents = 5000\n",
  "num_topics = 5\n",
  "vocabulary_size = 25\n",
  "known_alpha, known_beta, documents, topic_mixtures = generate_griffiths_data(\n",
- "    num_documents=num_documents, num_topics=num_topics, vocabulary_size=vocabulary_size\n",
- ")\n",
+ "    num_documents=num_documents, num_topics=num_topics, vocabulary_size=vocabulary_size)\n",
  "\n",
  "# separate the generated data into training and test subsets\n",
- "num_documents_training = int(0.8 * num_documents)\n",
+ "num_documents_training = int(0.8*num_documents)\n",
  "num_documents_test = num_documents - num_documents_training\n",
  "\n",
  "documents_training = documents[:num_documents_training]\n",

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "print(f\"First training document = {documents[0]}\")\n",
- "print(f\"\\nVocabulary size = {vocabulary_size}\")"
+ "print('First training document = {}'.format(documents[0]))\n",
+ "print('\\nVocabulary size = {}'.format(vocabulary_size))"
  ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "np.set_printoptions(precision=4, suppress=True)\n",
  "\n",
- "print(f\"Known topic mixture of first training document = {topic_mixtures_training[0]}\")\n",
- "print(f\"\\nNumber of topics = {num_topics}\")"
+ "print('Known topic mixture of first training document = {}'.format(topic_mixtures_training[0]))\n",
+ "print('\\nNumber of topics = {}'.format(num_topics))"
  ]
  },
  {

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "%matplotlib inline\n",
  "\n",
- "fig = plot_topic_data(documents_training[:10], nrows=2, ncols=5, cmap=\"gray_r\", with_colorbar=False)\n",
- "fig.suptitle(\"Example Documents\")\n",
+ "fig = plot_topic_data(documents_training[:10], nrows=2, ncols=5, cmap='gray_r', with_colorbar=False)\n",
+ "fig.suptitle('Example Documents')\n",
  "fig.set_dpi(160)"
  ]
  },

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "buf = io.BytesIO()\n",
- "smac.write_numpy_to_dense_tensor(buf, data_training[0].astype(\"float32\"))\n",
+ "smac.write_numpy_to_dense_tensor(buf, data_training[0].astype('float32'))\n",
  "buf.seek(0)\n",
  "\n",
- "key = \"ntm.data\"\n",
- "boto3.resource(\"s3\").Bucket(bucket).Object(os.path.join(prefix, \"train\", key)).upload_fileobj(buf)\n",
- "s3_train_data = f\"s3://{bucket}/{prefix}/train/{key}\""
+ "key = 'ntm.data'\n",
+ "boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)\n",
+ "s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)"
  ]
  },
  {
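
Note: this cell serializes the training matrix into the RecordIO-wrapped protobuf format that the first-party SageMaker algorithms consume, then stages it in S3; buf.seek(0) rewinds the buffer so the upload starts at byte zero. A hedged round-trip check (assumption: smac.read_records parses the same format that write_numpy_to_dense_tensor emits):

    # Sketch: verify the buffer contents before uploading
    buf.seek(0)
    records = smac.read_records(buf)          # parse the RecordIO-protobuf stream
    print('records written: {}'.format(len(records)))
    buf.seek(0)                               # rewind again before the upload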

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
- "\n",
- "container = get_image_uri(boto3.Session().region_name, \"ntm\")"
+ "container = get_image_uri(boto3.Session().region_name, 'ntm')"
  ]
  },
  {
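
Note: get_image_uri is the SDK v1 helper that resolves the regional ECR image for a built-in algorithm; SDK v2 replaced it with sagemaker.image_uris.retrieve. Both lines resolve the same NTM container (the v2 call is shown for reference only and is an assumption about the newer API, not part of this diff):

    # v1 helper, as used in this notebook
    container = get_image_uri(boto3.Session().region_name, 'ntm')

    # v2 equivalent (reference only)
    # container = sagemaker.image_uris.retrieve('ntm', boto3.Session().region_name)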

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "sess = sagemaker.Session()\n",
  "\n",
- "ntm = sagemaker.estimator.Estimator(\n",
- "    container,\n",
- "    role,\n",
- "    train_instance_count=1,\n",
- "    train_instance_type=\"ml.c4.xlarge\",\n",
- "    output_path=f\"s3://{bucket}/{prefix}/output\",\n",
- "    sagemaker_session=sess,\n",
- ")\n",
- "ntm.set_hyperparameters(num_topics=num_topics, feature_dim=vocabulary_size)\n",
+ "ntm = sagemaker.estimator.Estimator(container,\n",
+ "                                    role,\n",
+ "                                    train_instance_count=1,\n",
+ "                                    train_instance_type='ml.c4.xlarge',\n",
+ "                                    output_path='s3://{}/{}/output'.format(bucket, prefix),\n",
+ "                                    sagemaker_session=sess)\n",
+ "ntm.set_hyperparameters(num_topics=num_topics,\n",
+ "                        feature_dim=vocabulary_size)\n",
  "\n",
- "ntm.fit({\"train\": s3_train_data})"
+ "ntm.fit({'train': s3_train_data})"
  ]
  },
  {
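
Note: the argument names above are the v1 spellings; SDK v2 renamed train_instance_count and train_instance_type to instance_count and instance_type. For readers on a newer SDK, a sketch of the same estimator in v2 terms (ntm_v2 is a hypothetical name; the diff itself targets v1):

    # v2 spelling of the estimator above (reference sketch)
    ntm_v2 = sagemaker.estimator.Estimator(
        container,
        role,
        instance_count=1,              # was train_instance_count
        instance_type='ml.c4.xlarge',  # was train_instance_type
        output_path='s3://{}/{}/output'.format(bucket, prefix),
        sagemaker_session=sess,
    )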

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "ntm_predictor = ntm.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
+ "ntm_predictor = ntm.deploy(initial_instance_count=1,\n",
+ "                           instance_type='ml.m4.xlarge')"
  ]
  },
  {

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "ntm_predictor.serializer = CSVSerializer()\n",
- "ntm_predictor.deserializer = JSONDeserializer()"
+ "ntm_predictor.content_type = 'text/csv'\n",
+ "ntm_predictor.serializer = csv_serializer\n",
+ "ntm_predictor.deserializer = json_deserializer"
  ]
  },
  {
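
Note: in SDK v1 the predictor is configured by assigning attributes, including an explicit content_type; once these are set, predict() sends text/csv automatically, which is why the cells below drop the initial_args={'ContentType': 'text/csv'} argument that the v2 code needed. The removed lines are the v2 style, where serializer classes are instantiated and the content type comes from the serializer object itself. In v1 a per-request override also remains available:

    # Equivalent per-request override (v1 sketch; unnecessary once the
    # content_type/serializer attributes above are set)
    # results = ntm_predictor.predict(payload, initial_args={'ContentType': 'text/csv'})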

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "results = ntm_predictor.predict(documents_training[:10], initial_args={\"ContentType\": \"text/csv\"})\n",
+ "results = ntm_predictor.predict(documents_training[:10])\n",
  "print(results)"
  ]
  },

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "predictions = np.array([prediction[\"topic_weights\"] for prediction in results[\"predictions\"]])\n",
+ "predictions = np.array([prediction['topic_weights'] for prediction in results['predictions']])\n",
  "\n",
  "print(predictions)"
  ]
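
Note: the comprehension above depends on the endpoint's JSON response shape. For reference, the deserialized NTM response looks roughly like this (values are made up; only the 'predictions' and 'topic_weights' keys matter):

    # Illustrative response after json_deserializer (one entry per input row)
    results = {
        'predictions': [
            {'topic_weights': [0.28, 0.03, 0.51, 0.11, 0.07]},
            # ... nine more entries for the ten documents sent above
        ]
    }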

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "print(topic_mixtures_training[0]) # known topic mixture\n",

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "def predict_batches(data, rows=1000):\n",
  "    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n",
  "    predictions = []\n",
  "    for array in split_array:\n",
- "        results = ntm_predictor.predict(array, initial_args={\"ContentType\": \"text/csv\"})\n",
- "        predictions += [r[\"topic_weights\"] for r in results[\"predictions\"]]\n",
+ "        results = ntm_predictor.predict(array)\n",
+ "        predictions += [r['topic_weights'] for r in results['predictions']]\n",
  "    return np.array(predictions)"
  ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
  "predictions = predict_batches(documents_training)"

  {
  "cell_type": "code",
  "execution_count": null,
- "metadata": {},
+ "metadata": {
+ "collapsed": true
+ },
  "outputs": [],
  "source": [
- "data = pd.DataFrame(\n",
- "    np.concatenate([topic_mixtures_training, predictions], axis=1),\n",
- "    columns=[f\"actual_{i}\" for i in range(5)] + [f\"predictions_{i}\" for i in range(5)],\n",
- ")\n",
+ "data = pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1),\n",
+ "                    columns=['actual_{}'.format(i) for i in range(5)] + ['predictions_{}'.format(i) for i in range(5)])\n",
  "display(data.corr())\n",
- "pd.plotting.scatter_matrix(\n",
- "    pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1)), figsize=(12, 12)\n",
- ")\n",
+ "pd.plotting.scatter_matrix(pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1)), figsize=(12, 12))\n",
  "plt.show()"
  ]
  },

  "cell_type": "code",
  "execution_count": null,
  "metadata": {
- "jupyter": {
- "source_hidden": true
- }
+ "collapsed": true
  },
  "outputs": [],
  "source": [

  ],
  "metadata": {
  "celltoolbar": "Tags",
- "instance_type": "ml.t3.medium",
  "kernelspec": {
- "display_name": "Python 3 (Data Science)",
+ "display_name": "Environment (conda_python3)",
  "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
+ "name": "conda_python3"
  },
  "language_info": {
  "codemirror_mode": {

  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.7.6"
+ "version": "3.6.3"
  },
  "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
  },
  "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 2
  }