|
46 | 46 | "\n",
|
47 | 47 | "* In your AWS console, go to Lambda Management Console,\n",
|
48 | 48 | "* Create a new function by hitting Create Function,\n",
|
49 |
| - "* Choose the language as Python 3.7 and put in the following sample code for stopping the training job if one of the Rule statuses is `\"IssuesFound\"`:\n", |
| 49 | + "* Choose Python 3.7 (or higher) as the runtime and paste in the following sample code, which stops the training job if one of the Rule statuses is `\"IssuesFound\"`:\n", |
50 | 50 | "\n",
|
51 | 51 | "```python\n",
|
52 | 52 | "import json\n",
|
53 | 53 | "import boto3\n",
|
54 | 54 | "import logging\n",
|
55 | 55 | "\n",
|
| 56 | + "logger = logging.getLogger()\n", |
| 57 | + "logger.setLevel(logging.INFO)\n", |
| 58 | + "\n", |
| 59 | + "\n", |
56 | 60 | "def lambda_handler(event, context):\n",
|
57 | 61 | " training_job_name = event.get(\"detail\").get(\"TrainingJobName\")\n",
|
| 62 | + " logging.info(f'Evaluating Debugger rules for training job: {training_job_name}')\n", |
| 63 | + "\n", |
58 | 64 | " eval_statuses = event.get(\"detail\").get(\"DebugRuleEvaluationStatuses\", None)\n",
|
59 | 65 | "\n",
|
60 | 66 | " if eval_statuses is None or len(eval_statuses) == 0:\n",
|
|
64 | 70 | " 'body': json.dumps('Nothing to do')\n",
|
65 | 71 | " }\n",
|
66 | 72 | "\n",
|
| 73 | + " # should only attempt stopping jobs with InProgress status\n", |
| 74 | + " training_job_status = event.get(\"detail\").get(\"TrainingJobStatus\", None)\n", |
| 75 | + " if training_job_status != 'InProgress':\n", |
| 76 | + " logging.debug(f\"Current Training job status({training_job_status}) is not 'InProgress'. Exiting\")\n", |
| 77 | + " return {\n", |
| 78 | + " 'statusCode': 200,\n", |
| 79 | + " 'body': json.dumps('Nothing to do')\n", |
| 80 | + " }\n", |
| 81 | + "\n", |
67 | 82 | " client = boto3.client('sagemaker')\n",
|
68 | 83 | "\n",
|
69 | 84 | " for status in eval_statuses:\n",
|
| 85 | + " logging.info(status.get(\"RuleEvaluationStatus\") + ', RuleEvaluationStatus=' + str(status))\n", |
70 | 86 | " if status.get(\"RuleEvaluationStatus\") == \"IssuesFound\":\n",
|
| 87 | + " secondary_status = event.get(\"detail\").get(\"SecondaryStatus\", None)\n", |
71 | 88 | " logging.info(\n",
|
72 |
| - " 'Evaluation of rule configuration {} resulted in \"IssuesFound\". '\n", |
73 |
| - " 'Attempting to stop training job {}'.format(\n", |
74 |
| - " status.get(\"RuleConfigurationName\"), training_job_name\n", |
75 |
| - " )\n", |
| 89 | + " f'About to stop training job, since evaluation of rule configuration {status.get(\"RuleConfigurationName\")} resulted in \"IssuesFound\". ' +\n", |
| 90 | + " f'\\ntraining job \"{training_job_name}\" status is \"{training_job_status}\", secondary status is \"{secondary_status}\"' +\n", |
| 91 | + " f'\\nAttempting to stop training job \"{training_job_name}\"'\n", |
76 | 92 | " )\n",
|
77 | 93 | " try:\n",
|
78 | 94 | " client.stop_training_job(\n",
|
|
90 | 106 | "```\n",
|
91 | 107 | "* Create a new execution role for the Lambda function.\n",
|
92 | 108 | "* In your IAM console, search for the role and attach \"AmazonSageMakerFullAccess\" policy to the role. This is needed for the code in your Lambda function to stop the training job.\n",
|
| 109 | + "* In the function's Basic settings, set the Timeout to 30 seconds instead of the default 3 seconds.\n", |
93 | 110 | "\n",
|
94 | 111 | "#### Create a CloudWatch Rule\n",
|
95 | 112 | "\n",
|
|
0 commit comments