Skip to content

enclosing get_waiter for training within try-catch #119

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 29, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"isConfigCell": true
},
"outputs": [],
Expand All @@ -69,10 +68,10 @@
"assumed_role = boto3.client('sts').get_caller_identity()['Arn']\n",
"role = re.sub(r'^(.+)sts::(\\d+):assumed-role/(.+?)/.*$', r'\\1iam::\\2:role/\\3', assumed_role)\n",
"\n",
"kms_key_id = '<your_kms_key_arn_here>'\n",
"kms_key_id = '<your-kms-key-id>'\n",
"\n",
"bucket='<your_s3_bucket_name_here>' # put your s3 bucket name here, and create s3 bucket\n",
"prefix = 'sagemarker/kms-new'\n",
"bucket='<s3-bucket>' # put your s3 bucket name here, and create s3 bucket\n",
"prefix = 'sagemaker/kms'\n",
"# customize to your bucket where you have stored the data\n",
"bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)"
]
Expand All @@ -90,17 +89,6 @@
"First, we read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. The next step would then be to transfer the data to S3 for use in training. For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!conda install -y -c conda-forge scikit-learn"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -189,9 +177,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"s3 = boto3.client('s3')\n",
Expand Down Expand Up @@ -248,9 +234,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
Expand Down Expand Up @@ -299,7 +283,7 @@
" \"S3DataDistributionType\": \"FullyReplicated\"\n",
" }\n",
" },\n",
" \"ContentType\": \"libsvm\",\n",
" \"ContentType\": \"csv\",\n",
" \"CompressionType\": \"None\"\n",
" },\n",
" {\n",
Expand All @@ -320,12 +304,17 @@
"client = boto3.client('sagemaker')\n",
"client.create_training_job(**create_training_params)\n",
"\n",
"status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n",
"print(status)\n",
"while status !='Completed' and status!='Failed':\n",
" time.sleep(60)\n",
" status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']\n",
" print(status)"
"try:\n",
" # wait for the job to finish and report the ending status\n",
" client.get_waiter('TrainingJob_Created').wait(TrainingJobName=job_name)\n",
" training_info = client.describe_training_job(TrainingJobName=job_name)\n",
" status = training_info['TrainingJobStatus']\n",
" print(\"Training job ended with status: \" + status)\n",
"except:\n",
" print('Training failed to start')\n",
" # if exception is raised, that means it has failed\n",
" message = client.describe_training_job(TrainingJobName=job_name)['FailureReason']\n",
" print('Training failed with the following error: {}'.format(message))"
]
},
{
Expand All @@ -343,9 +332,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
Expand Down Expand Up @@ -384,9 +371,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from time import gmtime, strftime\n",
Expand Down Expand Up @@ -416,9 +401,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
Expand All @@ -431,18 +414,26 @@
" EndpointConfigName=endpoint_config_name)\n",
"print(create_endpoint_response['EndpointArn'])\n",
"\n",
"resp = client.describe_endpoint(EndpointName=endpoint_name)\n",
"status = resp['EndpointStatus']\n",
"print(\"Status: \" + status)\n",
"\n",
"while status=='Creating':\n",
" time.sleep(60)\n",
" resp = client.describe_endpoint(EndpointName=endpoint_name)\n",
" status = resp['EndpointStatus']\n",
" print(\"Status: \" + status)\n",
"print('EndpointArn = {}'.format(create_endpoint_response['EndpointArn']))\n",
"\n",
"# get the status of the endpoint\n",
"response = client.describe_endpoint(EndpointName=endpoint_name)\n",
"status = response['EndpointStatus']\n",
"print('EndpointStatus = {}'.format(status))\n",
"\n",
"\n",
"print(\"Arn: \" + resp['EndpointArn'])\n",
"print(\"Status: \" + status)"
"# wait until the status has changed\n",
"client.get_waiter('Endpoint_Created').wait(EndpointName=endpoint_name)\n",
"\n",
"\n",
"# print the status of the endpoint\n",
"endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)\n",
"status = endpoint_response['EndpointStatus']\n",
"print('Endpoint creation ended with EndpointStatus = {}'.format(status))\n",
"\n",
"if status != 'InService':\n",
" raise Exception('Endpoint creation failed.')"
]
},
{
Expand Down Expand Up @@ -508,9 +499,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
Expand Down Expand Up @@ -542,12 +531,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"#client.delete_endpoint(EndpointName=endpoint_name)"
"# client.delete_endpoint(EndpointName=endpoint_name)"
]
}
],
Expand Down