Skip to content

Commit 4f14e19

Browse files
committed
fix: Add retry in session.py to check if training is finished
1 parent 284ddbe commit 4f14e19

File tree

1 file changed

+22
-12
lines changed

1 file changed

+22
-12
lines changed

src/sagemaker/session.py

Lines changed: 22 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
secondary_training_status_changed,
4242
secondary_training_status_message,
4343
sts_regional_endpoint,
44+
retries,
4445
)
4546
from sagemaker import exceptions
4647
from sagemaker.session_settings import SessionSettings
@@ -4691,21 +4692,30 @@ def _train_done(sagemaker_client, job_name, last_desc):
46914692
"""Placeholder docstring"""
46924693
in_progress_statuses = ["InProgress", "Created"]
46934694

4694-
desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
4695-
status = desc["TrainingJobStatus"]
4695+
for _ in retries(
4696+
max_retry_count=10, # 10*30 = 5min
4697+
exception_message_prefix="Waiting for schedule to leave 'Pending' status",
4698+
seconds_to_sleep=30,
4699+
):
4700+
try:
4701+
desc = sagemaker_client.describe_training_job(TrainingJobName=job_name)
4702+
status = desc["TrainingJobStatus"]
46964703

4697-
if secondary_training_status_changed(desc, last_desc):
4698-
print()
4699-
print(secondary_training_status_message(desc, last_desc), end="")
4700-
else:
4701-
print(".", end="")
4702-
sys.stdout.flush()
4704+
if secondary_training_status_changed(desc, last_desc):
4705+
print()
4706+
print(secondary_training_status_message(desc, last_desc), end="")
4707+
else:
4708+
print(".", end="")
4709+
sys.stdout.flush()
47034710

4704-
if status in in_progress_statuses:
4705-
return desc, False
4711+
if status in in_progress_statuses:
4712+
return desc, False
47064713

4707-
print()
4708-
return desc, True
4714+
print()
4715+
return desc, True
4716+
except botocore.exceptions.ClientError as err:
4717+
if err.response["Error"]["Code"] == "AccessDeniedException":
4718+
pass
47094719

47104720

47114721
def _processing_job_status(sagemaker_client, job_name):

0 commit comments

Comments (0)