Support to get latest monitoring execution processing logs

Keshav Chandak · Keshav Chandak · commit 090225fea8a2 · 2023-07-26T18:37:41.000Z
diff --git a/src/sagemaker/model_monitor/clarify_model_monitoring.py b/src/sagemaker/model_monitor/clarify_model_monitoring.py
@@ -27,6 +27,7 @@
 from sagemaker.session import Session
 from sagemaker.utils import name_from_base
 from sagemaker.clarify import SageMakerClarifyProcessor, ModelPredictedLabelConfig
+from sagemaker.lineage._utils import get_resource_name_from_arn
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -154,6 +155,26 @@ def list_executions(self):
             for execution in executions
         ]
 
+    def get_latest_execution_logs(self, wait=False):
+        """Get the processing job logs for the most recent monitoring execution.
+
+        Args:
+            wait (bool): Whether the call should wait until the job completes (default: False).
+
+        Raises:
+            ValueError: If the schedule has no executions, or if the first listed
+                execution has no ``ProcessingJobArn``.
+        """
+        summaries = self.sagemaker_session.list_monitoring_executions(
+            monitoring_schedule_name=self.monitoring_schedule_name
+        )["MonitoringExecutionSummaries"]
+        if len(summaries) == 0:
+            raise ValueError("No execution jobs were kicked off.")
+        if "ProcessingJobArn" not in summaries[0]:
+            raise ValueError("Processing Job did not run for the last execution")
+        job_name = get_resource_name_from_arn(summaries[0]["ProcessingJobArn"])
+        self.sagemaker_session.logs_for_processing_job(job_name=job_name, wait=wait)
+
     def _create_baselining_processor(self):
         """Create and return a SageMakerClarifyProcessor object which will run the baselining job.
 
diff --git a/src/sagemaker/model_monitor/model_monitoring.py b/src/sagemaker/model_monitor/model_monitoring.py
@@ -63,6 +63,7 @@
     resolve_value_from_config,
     resolve_class_attribute_from_config,
 )
+from sagemaker.lineage._utils import get_resource_name_from_arn
 
 DEFAULT_REPOSITORY_NAME = "sagemaker-model-monitor-analyzer"
 
@@ -768,6 +769,26 @@ def list_executions(self):
 
         return monitoring_executions
 
+    def get_latest_execution_logs(self, wait=False):
+        """Get the processing job logs for the most recent monitoring execution.
+
+        Args:
+            wait (bool): Whether the call should wait until the job completes (default: False).
+
+        Raises:
+            ValueError: If the schedule has no executions, or if the first listed
+                execution has no ``ProcessingJobArn``.
+        """
+        summaries = self.sagemaker_session.list_monitoring_executions(
+            monitoring_schedule_name=self.monitoring_schedule_name
+        )["MonitoringExecutionSummaries"]
+        if len(summaries) == 0:
+            raise ValueError("No execution jobs were kicked off.")
+        if "ProcessingJobArn" not in summaries[0]:
+            raise ValueError("Processing Job did not run for the last execution")
+        job_name = get_resource_name_from_arn(summaries[0]["ProcessingJobArn"])
+        self.sagemaker_session.logs_for_processing_job(job_name=job_name, wait=wait)
+
     def update_monitoring_alert(
         self,
         monitoring_alert_name: str,
diff --git a/tests/integ/test_clarify_model_monitor.py b/tests/integ/test_clarify_model_monitor.py
@@ -293,9 +293,9 @@ def test_bias_monitor(sagemaker_session, scheduled_bias_monitor, endpoint_name,
 )
 @pytest.mark.flaky(reruns=5, reruns_delay=2)
 def test_run_bias_monitor(
-    scheduled_bias_monitor, sagemaker_session, endpoint_name, ground_truth_input, upload_actual_data
+    scheduled_bias_monitor, sagemaker_session, endpoint_name, ground_truth_input, capfd
 ):
-    _verify_execution_status(scheduled_bias_monitor)
+    _verify_execution_status(scheduled_bias_monitor, capfd=capfd)
 
     _verify_bias_job_description(
         sagemaker_session=sagemaker_session,
@@ -408,8 +408,9 @@ def test_run_explainability_monitor(
     endpoint_name,
     ground_truth_input,
     upload_actual_data,
+    capfd,
 ):
-    _verify_execution_status(scheduled_explainability_monitor)
+    _verify_execution_status(scheduled_explainability_monitor, capfd=capfd)
 
     _verify_explainability_job_description(
         sagemaker_session=sagemaker_session,
@@ -514,11 +515,12 @@ def _verify_job_description(
     )
 
 
-def _verify_execution_status(monitor):
+def _verify_execution_status(monitor, capfd):
     _wait_for_completion(monitor)
     executions = monitor.list_executions()
     assert len(executions) > 0
     schedule_desc = monitor.describe_schedule()
+    _check_processing_logs_generated(monitor, schedule_description=schedule_desc, capfd=capfd)
     execution_summary = schedule_desc.get("LastMonitoringExecutionSummary")
     last_execution_status = execution_summary["MonitoringExecutionStatus"]
     assert last_execution_status in ["Completed", "CompletedWithViolations"]
@@ -600,3 +602,18 @@ def _wait_for_completion(monitor):
         # End this loop once the execution has reached a terminal state.
         if last_execution_status in ["Completed", "CompletedWithViolations", "Failed", "Stopped"]:
             break
+
+
+def _check_processing_logs_generated(monitor, schedule_description, capfd):
+    """Assert that fetching the latest execution logs prints output containing the job ARN.
+
+    Args:
+        monitor: Monitor whose ``get_latest_execution_logs`` method is exercised.
+        schedule_description (dict): Schedule description whose
+            ``LastMonitoringExecutionSummary`` holds the expected ``ProcessingJobArn``.
+        capfd: Capture fixture used to read what the call wrote to stdout.
+    """
+    monitor.get_latest_execution_logs(wait=False)
+    out, _ = capfd.readouterr()
+    assert len(out) > 0
+    assert schedule_description.get("LastMonitoringExecutionSummary")["ProcessingJobArn"] in out
diff --git a/tests/integ/test_model_monitor.py b/tests/integ/test_model_monitor.py
@@ -486,6 +486,44 @@ def test_default_monitor_suggest_baseline_and_create_monitoring_schedule_with_cu
     assert len(summary["MonitoringScheduleSummaries"]) > 0
 
 
+def test_default_monitor_display_logs_errors(sagemaker_session, predictor, capfd):
+    """Check the errors raised by ``get_latest_execution_logs`` on a new schedule.
+
+    The schedule is stopped and deleted afterwards even if an assertion fails.
+    """
+    my_default_monitor = DefaultModelMonitor(role=ROLE, sagemaker_session=sagemaker_session)
+
+    my_default_monitor.create_monitoring_schedule(
+        endpoint_input=predictor.endpoint_name,
+        schedule_cron_expression=CronExpressionGenerator.hourly(),
+    )
+
+    try:
+        # A schedule that was only just created is expected to have no executions yet.
+        with pytest.raises(ValueError, match="No execution jobs were kicked off"):
+            my_default_monitor.get_latest_execution_logs(wait=False)
+
+        for _ in retries(
+            max_retry_count=100,
+            exception_message_prefix="Waiting for the an execution to start",
+            seconds_to_sleep=50,
+        ):
+            summary = my_default_monitor.describe_schedule().get("LastMonitoringExecutionSummary")
+            # Stop polling as soon as the latest execution reports any status.
+            if summary is not None and summary["MonitoringExecutionStatus"] is not None:
+                break
+
+        # By now the execution may or may not own a processing job, so a successful
+        # call is accepted and only the message of a raised ValueError is checked.
+        try:
+            my_default_monitor.get_latest_execution_logs(wait=False)
+        except ValueError as ve:
+            assert "Processing Job did not run for the last execution" in str(ve)
+    finally:
+        my_default_monitor.stop_monitoring_schedule()
+        my_default_monitor.delete_monitoring_schedule()
+
+
 @pytest.mark.skipif(
     tests.integ.test_region() in tests.integ.NO_MODEL_MONITORING_REGIONS,
     reason="ModelMonitoring is not yet supported in this region.",
@@ -1643,6 +1681,7 @@ def test_byoc_monitor_attach_followed_by_baseline_and_update_monitoring_schedule
     output_kms_key,
     updated_volume_kms_key,
     updated_output_kms_key,
+    capfd,
 ):
     baseline_dataset = os.path.join(DATA_DIR, "monitor/baseline_dataset.csv")
 
@@ -1771,6 +1810,10 @@ def test_byoc_monitor_attach_followed_by_baseline_and_update_monitoring_schedule
 
     _wait_for_schedule_changes_to_apply(monitor=my_attached_monitor)
 
+    _check_processing_logs_generated(
+        monitor=my_attached_monitor, schedule_description=schedule_description, capfd=capfd
+    )
+
     my_attached_monitor.stop_monitoring_schedule()
 
     _wait_for_schedule_changes_to_apply(monitor=my_attached_monitor)
@@ -1877,6 +1920,22 @@ def test_default_monitor_monitoring_alerts(sagemaker_session, predictor):
     my_default_monitor.delete_monitoring_schedule()
 
 
+def _check_processing_logs_generated(monitor, schedule_description, capfd):
+    """Assert that fetching the latest execution logs prints output containing the job ARN.
+
+    Args:
+        monitor: Monitor whose ``get_latest_execution_logs`` method is exercised.
+        schedule_description (dict): Schedule description whose
+            ``LastMonitoringExecutionSummary`` holds the expected ``ProcessingJobArn``.
+        capfd: Capture fixture used to read what the call wrote to stdout.
+    """
+    monitor.get_latest_execution_logs(wait=False)
+    stdout, _stderr = capfd.readouterr()
+    assert len(stdout) > 0
+    last_execution = schedule_description.get("LastMonitoringExecutionSummary")
+    assert last_execution["ProcessingJobArn"] in stdout
+
+
 def _wait_for_schedule_changes_to_apply(monitor):
     """Waits for the monitor to no longer be in the 'Pending' state. Updates take under a minute
     to apply.
diff --git a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py
@@ -221,6 +221,14 @@
     "NetworkConfig": NETWORK_CONFIG._to_request_dict(),
 }
 
+MONITORING_EXECUTIONS_EMPTY = {  # list_monitoring_executions result without executions
+    "MonitoringExecutionSummaries": [],
+}
+
+MONITORING_EXECUTIONS_NO_PROCESSING_JOB = {  # one execution summary lacking "ProcessingJobArn"
+    "MonitoringExecutionSummaries": [{"MonitoringSchedule": "MonitoringSchedule"}],
+}
+
 # For update API
 NEW_ROLE_ARN = "arn:aws:iam::012345678902:role/{}".format(ROLE)
 NEW_INSTANCE_COUNT = 2
@@ -1716,3 +1724,20 @@ def _test_model_explainability_monitor_delete_schedule(
     sagemaker_session.sagemaker_client.delete_model_explainability_job_definition.assert_called_once_with(
         JobDefinitionName=job_definition_name
     )
+
+
+def test_model_explainability_monitor_logs_failure(model_explainability_monitor, sagemaker_session):
+    """Verify get_latest_execution_logs raises ValueError when no processing job exists."""
+    import pytest  # local import keeps this test self-contained
+
+    sagemaker_session.list_monitoring_executions = MagicMock(
+        return_value=MONITORING_EXECUTIONS_EMPTY
+    )
+    with pytest.raises(ValueError, match="No execution jobs were kicked off"):
+        model_explainability_monitor.get_latest_execution_logs()
+
+    sagemaker_session.list_monitoring_executions = MagicMock(
+        return_value=MONITORING_EXECUTIONS_NO_PROCESSING_JOB
+    )
+    with pytest.raises(ValueError, match="Processing Job did not run for the last execution"):
+        model_explainability_monitor.get_latest_execution_logs()