pathway: fix executor's incompatibility with Python 3.7

Zhankuil · Namrata Madan · commit d1dcfaadab00 · 2023-04-18T13:18:20.000-07:00
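The Semaphore.release API changed in Python 3.9 (it gained the optional `n` argument), so calling `release(1)` fails on Python 3.7; call `release()` with no argument instead.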
diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py
@@ -20,14 +20,22 @@
 from typing import Dict, List, Tuple, Any
 import functools
 import inspect
+import logging
+
+from botocore.exceptions import ClientError
 
 import sagemaker.remote_function.core.serialization as serialization
 
 from sagemaker.session import Session
 from sagemaker.s3 import s3_path_join
 from sagemaker.remote_function.job import _JobSettings, _Job
 
-_POLLING_INTERVAL_IN_SECS = 10
+
+_API_CALL_LIMIT = {
+    "SubmittingIntervalInSecs": 1,
+    "MinBatchPollingIntervalInSecs": 10,
+    "PollingIntervalInSecs": 0.5,
+}
 
 # Possible future states.
 _PENDING = "PENDING"
@@ -36,6 +44,9 @@
 _CANCELLED = "CANCELLED"
 _FINISHED = "FINISHED"
 
+LOGGER = logging.getLogger(__name__)
+LOGGER.setLevel(logging.INFO)
+
 
 def remote(
     _func=None,
@@ -154,48 +165,73 @@ def __init__(self, future, job_settings: _JobSettings, func, func_args, func_kwa
 
 def _submit_worker(executor):
     """Background worker that submits job requests."""
-    while True:
-        request = executor._pending_request_queue.get(block=True)
+    try:
+        while True:
+            request = executor._pending_request_queue.get(block=True)
 
-        if request is None:
-            return
+            if request is None:
+                return
 
-        executor._semaphore.acquire(blocking=True)
+            executor._semaphore.acquire(blocking=True)
 
-        # submit a new job
-        job = request.future._start_and_notify(
-            request.job_settings, request.func, request.args, request.kwargs
-        )
+            time.sleep(_API_CALL_LIMIT["SubmittingIntervalInSecs"])
+            # submit a new job
+            job = request.future._start_and_notify(
+                request.job_settings, request.func, request.args, request.kwargs
+            )
 
-        if job is None:
-            # job fails to submit
-            executor._semaphore.release(1)
-        else:
-            executor._running_jobs[job.job_name] = job
+            if job is None:
+                # job fails to submit
+                executor._semaphore.release()
+            else:
+                executor._running_jobs[job.job_name] = job
+    except Exception:  # pylint: disable=broad-except
+        LOGGER.exception("Error occurred while submitting CreateTrainingJob requests.")
 
 
 def _polling_worker(executor):
     """Background worker that polls the status of the running jobs."""
-    while True:
-        if executor._shutdown and len(executor._running_jobs) == 0:
-            return
-
-        time.sleep(_POLLING_INTERVAL_IN_SECS)
+    try:
+        while True:
+            if executor._shutdown and len(executor._running_jobs) == 0:
+                return
+
+            time.sleep(
+                max(
+                    _API_CALL_LIMIT["MinBatchPollingIntervalInSecs"]
+                    - len(executor._running_jobs) * _API_CALL_LIMIT["PollingIntervalInSecs"],
+                    0,
+                )
+            )
 
-        # check if running jobs are terminated
-        for job_name in executor._running_jobs.keys():
-            if executor._running_jobs[job_name].describe()["TrainingJobStatus"] in [
-                "Completed",
-                "Failed",
-                "Stopped",
-            ]:
-                del executor._running_jobs[job_name]
-                executor._semaphore.release(1)
+            # check if running jobs are terminated
+            for job_name in executor._running_jobs.keys():
+                try:
+                    time.sleep(_API_CALL_LIMIT["PollingIntervalInSecs"])
+                    if executor._running_jobs[job_name].describe()["TrainingJobStatus"] in [
+                        "Completed",
+                        "Failed",
+                        "Stopped",
+                    ]:
+                        del executor._running_jobs[job_name]
+                        executor._semaphore.release()
+                except Exception as e:  # pylint: disable=broad-except
+                    if (
+                        not isinstance(e, ClientError)
+                        or e.response["Error"]["Code"]  # pylint: disable=no-member
+                        != "LimitExceededException"
+                    ):
+                        # Couldn't check the job status, move on
+                        LOGGER.exception(
+                            "Error occurred while checking the status of job %s", job_name
+                        )
+                        del executor._running_jobs[job_name]
+                        executor._semaphore.release()
+    except Exception:  # pylint: disable=broad-except
+        LOGGER.exception("Error occurred while monitoring the job statuses.")
 
 
 # TODO: 1) add map method.
-#  2) in the background workers, limit rate of calls to CreateTrainingJob
-#  and DescribeTrainingJob APIs
 class RemoteExecutor(object):
     """Run Python functions asynchronously as SageMaker jobs"""
 
diff --git a/tests/unit/sagemaker/remote_function/test_client.py b/tests/unit/sagemaker/remote_function/test_client.py
@@ -18,6 +18,7 @@
 import pytest
 from mock import patch, Mock, ANY, call
 
+from botocore.exceptions import ClientError
 from sagemaker.remote_function.client import remote, RemoteExecutor, Future
 
 TRAINING_JOB_ARN = "training-job-arn"
@@ -47,6 +48,12 @@ def describe_training_job_response(job_status):
 CANCELLED_TRAINING_JOB = describe_training_job_response("Stopped")
 FAILED_TRAINING_JOB = describe_training_job_response("Failed")
 
+API_CALL_LIMIT = {
+    "SubmittingIntervalInSecs": 0.005,
+    "MinBatchPollingIntervalInSecs": 0.01,
+    "PollingIntervalInSecs": 0.01,
+}
+
 
 def job_function(a, b=1, *, c, d=3):
     return a * b * c * d
@@ -171,7 +178,7 @@ def test_executor_submit_after_shutdown():
         e.submit(job_function, 1, 2, c=3, d=4)
 
 
-@patch("sagemaker.remote_function.client._POLLING_INTERVAL_IN_SECS", new=0.01)
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
 @patch("sagemaker.remote_function.client._Job.start")
 def test_executor_submit_happy_case(mock_start):
     mock_job = Mock()
@@ -194,8 +201,7 @@ def test_executor_submit_happy_case(mock_start):
     mock_job.describe.assert_called()
 
 
-@pytest.mark.skip("This test hangs forever in py37")
-@patch("sagemaker.remote_function.client._POLLING_INTERVAL_IN_SECS", new=0.01)
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
 @patch("sagemaker.remote_function.client._Job.start")
 def test_executor_submit_enforcing_max_parallel_jobs(mock_start):
     mock_job = Mock()
@@ -220,8 +226,7 @@ def test_executor_submit_enforcing_max_parallel_jobs(mock_start):
     assert future_2.done()
 
 
-@pytest.mark.skip("This test fails in py37")
-@patch("sagemaker.remote_function.client._POLLING_INTERVAL_IN_SECS", new=0.01)
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
 @patch("sagemaker.remote_function.client._Job.start")
 def test_executor_fails_to_start_job(mock_start):
     mock_job = Mock()
@@ -239,8 +244,7 @@ def test_executor_fails_to_start_job(mock_start):
     assert future_2.done()
 
 
-@pytest.mark.skip("This test hangs forever in py37")
-@patch("sagemaker.remote_function.client._POLLING_INTERVAL_IN_SECS", new=0.01)
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
 @patch("sagemaker.remote_function.client._Job.start")
 def test_executor_submit_and_cancel(mock_start):
     mock_job = Mock()
@@ -265,6 +269,53 @@ def test_executor_submit_and_cancel(mock_start):
     mock_start.assert_called_once_with(ANY, job_function, (1, 2), {"c": 3, "d": 4})
 
 
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
+@patch("sagemaker.remote_function.client._Job.start")
+def test_executor_describe_job_throttled_temporarily(mock_start):
+    throttling_error = ClientError(
+        error_response={"Error": {"Code": "LimitExceededException"}},
+        operation_name="SomeOperation",
+    )
+    mock_job = Mock()
+    mock_job.describe.side_effect = [
+        throttling_error,
+        throttling_error,
+        COMPLETED_TRAINING_JOB,
+        COMPLETED_TRAINING_JOB,
+        COMPLETED_TRAINING_JOB,
+        COMPLETED_TRAINING_JOB,
+    ]
+    mock_start.return_value = mock_job
+
+    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+        # submit first job
+        future_1 = e.submit(job_function, 1, 2, c=3, d=4)
+        # submit second job
+        future_2 = e.submit(job_function, 5, 6, c=7, d=8)
+
+    assert future_1.done()
+    assert future_2.done()
+
+
+@patch("sagemaker.remote_function.client._API_CALL_LIMIT", new=API_CALL_LIMIT)
+@patch("sagemaker.remote_function.client._Job.start")
+def test_executor_describe_job_failed_permanently(mock_start):
+    mock_job = Mock()
+    mock_job.describe.side_effect = RuntimeError()
+    mock_start.return_value = mock_job
+
+    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+        # submit first job
+        future_1 = e.submit(job_function, 1, 2, c=3, d=4)
+        # submit second job
+        future_2 = e.submit(job_function, 5, 6, c=7, d=8)
+
+    with pytest.raises(RuntimeError):
+        future_1.done()
+    with pytest.raises(RuntimeError):
+        future_2.done()
+
+
 @pytest.mark.parametrize(
     "args, kwargs, error_message",
     [