
Revert "Fixed notebooks for errors due to syntax change and cleaned notebooks - NTM" #1730


Merged · 1 commit · Nov 11, 2020
170 changes: 97 additions & 73 deletions in introduction_to_amazon_algorithms/ntm_synthetic/ntm_synthetic.ipynb
@@ -57,22 +57,23 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"isConfigCell": true,
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Define IAM role\n",
"import sagemaker\n",
"import boto3\n",
"import re\n",
"from sagemaker import get_execution_role\n",
"# Define IAM role\n",
"import sagemaker\n",
"import boto3\n",
"import re\n",
"from sagemaker import get_execution_role\n",
"\n",
"sess = sagemaker.Session()\n",
"bucket=sess.default_bucket()",
"\n",
"sess = sagemaker.Session()\n",
"bucket = sess.default_bucket()\n",
"prefix = \"ntm_demo\"\n",
"role = get_execution_role()"
]
},
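For context on this setup cell: both sides of the hunk rely on sagemaker.Session().default_bucket(). A minimal sketch of what that call resolves to (the bucket-name pattern is the SDK's default; the account ID shown is a placeholder):

import sagemaker

sess = sagemaker.Session()
# default_bucket() returns (creating it if missing) a bucket named
# "sagemaker-{region}-{account_id}", e.g. sagemaker-us-west-2-111122223333
print(sess.default_bucket())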
@@ -86,7 +87,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
@@ -101,8 +104,7 @@
"from IPython.display import display\n",
"import scipy\n",
"import sagemaker.amazon.common as smac\n",
"from sagemaker.serializers import CSVSerializer\n",
"from sagemaker.deserializers import JSONDeserializer"
"from sagemaker.predictor import csv_serializer, json_deserializer"
]
},
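The import change above is the heart of the revert: SageMaker Python SDK v2 replaced the v1 module-level serializer objects with classes. Side by side, assuming the respective SDK major version is installed:

# SDK v1 (what this revert restores): ready-made serializer objects
from sagemaker.predictor import csv_serializer, json_deserializer

# SDK v2 (what is being reverted): classes you instantiate yourself
# from sagemaker.serializers import CSVSerializer
# from sagemaker.deserializers import JSONDeserializer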
{
@@ -118,19 +120,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# generate the sample data\n",
"num_documents = 5000\n",
"num_topics = 5\n",
"vocabulary_size = 25\n",
"known_alpha, known_beta, documents, topic_mixtures = generate_griffiths_data(\n",
" num_documents=num_documents, num_topics=num_topics, vocabulary_size=vocabulary_size\n",
")\n",
" num_documents=num_documents, num_topics=num_topics, vocabulary_size=vocabulary_size)\n",
"\n",
"# separate the generated data into training and tests subsets\n",
"num_documents_training = int(0.8 * num_documents)\n",
"num_documents_training = int(0.8*num_documents)\n",
"num_documents_test = num_documents - num_documents_training\n",
"\n",
"documents_training = documents[:num_documents_training]\n",
@@ -157,23 +160,27 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(f\"First training document = {documents[0]}\")\n",
"print(f\"\\nVocabulary size = {vocabulary_size}\")"
"print('First training document = {}'.format(documents[0]))\n",
"print('\\nVocabulary size = {}'.format(vocabulary_size))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"np.set_printoptions(precision=4, suppress=True)\n",
"\n",
"print(f\"Known topic mixture of first training document = {topic_mixtures_training[0]}\")\n",
"print(f\"\\nNumber of topics = {num_topics}\")"
"print('Known topic mixture of first training document = {}'.format(topic_mixtures_training[0]))\n",
"print('\\nNumber of topics = {}'.format(num_topics))"
]
},
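The print changes in these cells are purely cosmetic; both forms behave identically on any Python >= 3.6 kernel:

vocabulary_size = 25
print("Vocabulary size = {}".format(vocabulary_size))  # style restored by the revert
print(f"Vocabulary size = {vocabulary_size}")           # style being reverted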
{
@@ -186,13 +193,15 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"fig = plot_topic_data(documents_training[:10], nrows=2, ncols=5, cmap=\"gray_r\", with_colorbar=False)\n",
"fig.suptitle(\"Example Documents\")\n",
"fig = plot_topic_data(documents_training[:10], nrows=2, ncols=5, cmap='gray_r', with_colorbar=False)\n",
"fig.suptitle('Example Documents')\n",
"fig.set_dpi(160)"
]
},
@@ -210,16 +219,18 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"buf = io.BytesIO()\n",
"smac.write_numpy_to_dense_tensor(buf, data_training[0].astype(\"float32\"))\n",
"smac.write_numpy_to_dense_tensor(buf, data_training[0].astype('float32'))\n",
"buf.seek(0)\n",
"\n",
"key = \"ntm.data\"\n",
"boto3.resource(\"s3\").Bucket(bucket).Object(os.path.join(prefix, \"train\", key)).upload_fileobj(buf)\n",
"s3_train_data = f\"s3://{bucket}/{prefix}/train/{key}\""
"key = 'ntm.data'\n",
"boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)\n",
"s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)"
]
},
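Either version of the upload cell leaves the RecordIO-protobuf payload at the same key. A small sanity check, assuming only boto3 and the variables defined in the cell (bucket, prefix, key, s3_train_data):

import os
import boto3

s3 = boto3.client("s3")
head = s3.head_object(Bucket=bucket, Key=os.path.join(prefix, "train", key))
print(s3_train_data, "-", head["ContentLength"], "bytes")  # confirms the object exists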
{
@@ -236,12 +247,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sagemaker.amazon.amazon_estimator import get_image_uri\n",
"\n",
"container = get_image_uri(boto3.Session().region_name, \"ntm\")"
"container = get_image_uri(boto3.Session().region_name, 'ntm')"
]
},
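get_image_uri is the SDK v1 helper; v2 moved this lookup to sagemaker.image_uris.retrieve. For reference, a sketch of both spellings (only the v1 form is used once this revert lands; the v2 line assumes sagemaker>=2.0):

import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri  # SDK v1

region = boto3.Session().region_name
container = get_image_uri(region, "ntm")

# SDK v2 equivalent:
# from sagemaker import image_uris
# container = image_uris.retrieve("ntm", region)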
{
@@ -260,22 +272,23 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sess = sagemaker.Session()\n",
"\n",
"ntm = sagemaker.estimator.Estimator(\n",
" container,\n",
" role,\n",
" train_instance_count=1,\n",
" train_instance_type=\"ml.c4.xlarge\",\n",
" output_path=f\"s3://{bucket}/{prefix}/output\",\n",
" sagemaker_session=sess,\n",
")\n",
"ntm.set_hyperparameters(num_topics=num_topics, feature_dim=vocabulary_size)\n",
"ntm = sagemaker.estimator.Estimator(container,\n",
" role, \n",
" train_instance_count=1, \n",
" train_instance_type='ml.c4.xlarge',\n",
" output_path='s3://{}/{}/output'.format(bucket, prefix),\n",
" sagemaker_session=sess)\n",
"ntm.set_hyperparameters(num_topics=num_topics,\n",
" feature_dim=vocabulary_size)\n",
"\n",
"ntm.fit({\"train\": s3_train_data})"
"ntm.fit({'train': s3_train_data})"
]
},
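Beyond formatting, the estimator hunk tracks an SDK rename: v2 dropped the train_ prefix from the instance arguments. A sketch of the v2 spelling being reverted, reusing the container, role, bucket, prefix, and sess values from earlier cells (ntm_v2_style is just an illustrative name):

ntm_v2_style = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,               # v1: train_instance_count
    instance_type="ml.c4.xlarge",   # v1: train_instance_type
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)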
{
@@ -294,10 +307,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ntm_predictor = ntm.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
"ntm_predictor = ntm.deploy(initial_instance_count=1,\n",
" instance_type='ml.m4.xlarge')"
]
},
{
@@ -322,11 +338,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ntm_predictor.serializer = CSVSerializer()\n",
"ntm_predictor.deserializer = JSONDeserializer()"
"ntm_predictor.content_type = 'text/csv'\n",
"ntm_predictor.serializer = csv_serializer\n",
"ntm_predictor.deserializer = json_deserializer"
]
},
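This cell is where the two SDKs diverge most visibly at inference time. In v1 the predictor is configured with module-level objects plus an explicit content_type attribute; in v2 the serializer classes carry the content type themselves and are typically passed to deploy(). A sketch of both, assuming ntm_predictor from the deploy cell:

# SDK v1 (restored by this revert)
ntm_predictor.content_type = "text/csv"
ntm_predictor.serializer = csv_serializer
ntm_predictor.deserializer = json_deserializer

# SDK v2 (being reverted); usually set at deploy time instead:
# from sagemaker.serializers import CSVSerializer
# from sagemaker.deserializers import JSONDeserializer
# ntm_predictor = ntm.deploy(
#     initial_instance_count=1,
#     instance_type="ml.m4.xlarge",
#     serializer=CSVSerializer(),
#     deserializer=JSONDeserializer(),
# )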
{
@@ -339,10 +358,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"results = ntm_predictor.predict(documents_training[:10], initial_args={\"ContentType\": \"text/csv\"})\n",
"results = ntm_predictor.predict(documents_training[:10])\n",
"print(results)"
]
},
@@ -369,10 +390,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predictions = np.array([prediction[\"topic_weights\"] for prediction in results[\"predictions\"]])\n",
"predictions = np.array([prediction['topic_weights'] for prediction in results['predictions']])\n",
"\n",
"print(predictions)"
]
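The list comprehension above assumes the NTM endpoint's JSON response shape, which both SDK versions return once a JSON deserializer is attached. A self-contained illustration with a dummy response:

import numpy as np

# Shape of a deserialized NTM response: one topic_weights vector per input row
results = {"predictions": [{"topic_weights": [0.1, 0.2, 0.4, 0.2, 0.1]},
                           {"topic_weights": [0.3, 0.1, 0.1, 0.3, 0.2]}]}
predictions = np.array([p["topic_weights"] for p in results["predictions"]])
print(predictions.shape)  # (2, 5): two documents, five topics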
@@ -387,7 +410,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"print(topic_mixtures_training[0]) # known topic mixture\n",
Expand All @@ -406,22 +431,26 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def predict_batches(data, rows=1000):\n",
" split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n",
" predictions = []\n",
" for array in split_array:\n",
" results = ntm_predictor.predict(array, initial_args={\"ContentType\": \"text/csv\"})\n",
" predictions += [r[\"topic_weights\"] for r in results[\"predictions\"]]\n",
" results = ntm_predictor.predict(array)\n",
" predictions += [r['topic_weights'] for r in results['predictions']]\n",
" return np.array(predictions)"
]
},
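predict_batches exists because a real-time endpoint caps each invocation payload (about 6 MB), so the full training matrix has to be sent in chunks. The chunking itself is plain NumPy:

import numpy as np

data = np.zeros((4000, 25))                      # stand-in for documents_training
chunks = np.array_split(data, int(data.shape[0] / 1000.0 + 1))
print([c.shape[0] for c in chunks])              # [800, 800, 800, 800, 800]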
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"predictions = predict_batches(documents_training)"
Expand All @@ -437,17 +466,15 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = pd.DataFrame(\n",
" np.concatenate([topic_mixtures_training, predictions], axis=1),\n",
" columns=[f\"actual_{i}\" for i in range(5)] + [f\"predictions_{i}\" for i in range(5)],\n",
")\n",
"data = pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1), \n",
" columns=['actual_{}'.format(i) for i in range(5)] + ['predictions_{}'.format(i) for i in range(5)])\n",
"display(data.corr())\n",
"pd.plotting.scatter_matrix(\n",
" pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1)), figsize=(12, 12)\n",
")\n",
"pd.plotting.scatter_matrix(pd.DataFrame(np.concatenate([topic_mixtures_training, predictions], axis=1)), figsize=(12, 12))\n",
"plt.show()"
]
},
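One caveat when reading the correlation matrix this cell displays: NTM learns topics in an arbitrary order, so high correlations need not sit on the diagonal. A possible extension (not part of this notebook) that matches learned topics to known ones with the Hungarian algorithm:

import numpy as np
from scipy.optimize import linear_sum_assignment

# Cross-correlation between known (rows) and predicted (columns) topic mixtures
corr = np.corrcoef(topic_mixtures_training.T, predictions.T)[:5, 5:]
row_ind, col_ind = linear_sum_assignment(-corr)  # maximize total correlation
print(dict(zip(row_ind, col_ind)))               # known topic -> learned topic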
@@ -478,9 +505,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"source_hidden": true
}
"collapsed": true
},
"outputs": [],
"source": [
Expand All @@ -501,11 +526,10 @@
],
"metadata": {
"celltoolbar": "Tags",
"instance_type": "ml.t3.medium",
"kernelspec": {
"display_name": "Python 3 (Data Science)",
"display_name": "Environment (conda_python3)",
"language": "python",
"name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
@@ -517,10 +541,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.6.3"
},
"notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
},
"nbformat": 4,
"nbformat_minor": 4
"nbformat_minor": 2
}