|
62 | 62 | "bucket='<bucket-name>'"
|
63 | 63 | ]
|
64 | 64 | },
|
| 65 | + { |
| 66 | + "cell_type": "code", |
| 67 | + "execution_count": null, |
| 68 | + "metadata": {}, |
| 69 | + "outputs": [], |
| 70 | + "source": [ |
| 71 | + "data_key = 'kmeans_example/data'\n", |
| 72 | + "data_location = 's3://{}/{}'.format(bucket, data_key)\n", |
| 73 | + "output_location = 's3://{}/kmeans_example/output'.format(bucket)\n", |
| 74 | + "\n", |
| 75 | + "print('training data will be uploaded to: {}'.format(data_location))\n", |
| 76 | + "print('training artifacts will be uploaded to: {}'.format(output_location))" |
| 77 | + ] |
| 78 | + }, |
65 | 79 | {
|
66 | 80 | "cell_type": "markdown",
|
67 | 81 | "metadata": {},
|
|
121 | 135 | "cell_type": "markdown",
|
122 | 136 | "metadata": {},
|
123 | 137 | "source": [
|
124 |
| - "### Data conversion\n", |
| 138 | + "### Data conversion and upload\n", |
125 | 139 | "\n",
|
126 | 140 | "Since algorithms have particular input and output requirements, converting the dataset is also part of the process that a data scientist goes through prior to initiating training. In this particular case, the hosted implementation of k-means takes recordio-wrapped protobuf, whereas the data we have today is a pickled numpy array on disk.\n",
|
127 | 141 | "\n",
|
|
140 | 154 | "%%time\n",
|
141 | 155 | "from sagemaker.amazon.common import write_numpy_to_dense_tensor\n",
|
142 | 156 | "import io\n",
|
| 157 | + "import boto3\n", |
143 | 158 | "\n",
|
144 | 159 | "# Convert the training data into the format required by the SageMaker KMeans algorithm\n",
|
145 | 160 | "buf = io.BytesIO()\n",
|
146 | 161 | "write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])\n",
|
147 |
| - "buf.seek(0)" |
148 |
| - ] |
149 |
| - }, |
150 |
| - { |
151 |
| - "cell_type": "code", |
152 |
| - "execution_count": null, |
153 |
| - "metadata": {}, |
154 |
| - "outputs": [], |
155 |
| - "source": [ |
156 |
| - "%%time\n", |
157 |
| - "\n", |
158 |
| - "import boto3\n", |
| 162 | + "buf.seek(0)\n", |
159 | 163 | "\n",
|
160 |
| - "key = 'kmeans_lowlevel_example/data'\n", |
161 |
| - "boto3.resource('s3').Bucket(bucket).Object(key).upload_fileobj(buf)\n", |
162 |
| - "s3_train_data = 's3://{}/{}'.format(bucket, key)\n", |
163 |
| - "print('uploaded training data location: {}'.format(s3_train_data))" |
| 164 | + "boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)" |
164 | 165 | ]
|
165 | 166 | },
|
166 | 167 | {
|
|
201 | 202 | " },\n",
|
202 | 203 | " \"RoleArn\": role,\n",
|
203 | 204 | " \"OutputDataConfig\": {\n",
|
204 |
| - " \"S3OutputPath\": \"s3://{}/kmeans_lowlevel_example/output\".format(bucket)\n", |
| 205 | + " \"S3OutputPath\": output_location\n", |
205 | 206 | " },\n",
|
206 | 207 | " \"ResourceConfig\": {\n",
|
207 | 208 | " \"InstanceCount\": 2,\n",
|
|
224 | 225 | " \"DataSource\": {\n",
|
225 | 226 | " \"S3DataSource\": {\n",
|
226 | 227 | " \"S3DataType\": \"S3Prefix\",\n",
|
227 |
| - " \"S3Uri\": s3_train_data,\n", |
| 228 | + " \"S3Uri\": data_location,\n", |
228 | 229 | " \"S3DataDistributionType\": \"FullyReplicated\"\n",
|
229 | 230 | " }\n",
|
230 | 231 | " },\n",
|
|
0 commit comments