@@ -950,6 +950,11 @@ def train( # noqa: C901
950
950
}
951
951
Returns:
952
952
str: ARN of the training job, if it is created.
953
+
954
+ Raises:
955
+ - botocore.exceptions.ClientError: If SageMaker throws an exception while creating
956
+ training job.
957
+ - ValueError: If both image_uri and algorithm are provided, or if neither is provided.
953
958
"""
954
959
tags = _append_project_tags (format_tags (tags ))
955
960
tags = self ._append_sagemaker_config_tags (
@@ -1033,9 +1038,19 @@ def train( # noqa: C901
1033
1038
)
1034
1039
1035
1040
def submit (request ):
1036
- logger .info ("Creating training-job with name: %s" , job_name )
1037
- logger .debug ("train request: %s" , json .dumps (request , indent = 4 ))
1038
- self .sagemaker_client .create_training_job (** request )
1041
+ try :
1042
+ logger .info ("Creating training-job with name: %s" , job_name )
1043
+ logger .debug ("train request: %s" , json .dumps (request , indent = 4 ))
1044
+ self .sagemaker_client .create_training_job (** request )
1045
+ except Exception as e :
1046
+ troubleshooting = (
1047
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html"
1048
+ "#sagemaker-python-sdk-troubleshooting-create-training-job"
1049
+ )
1050
+ logger .error (
1051
+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1052
+ )
1053
+ raise e
1039
1054
1040
1055
self ._intercept_create_request (train_request , submit , self .train .__name__ )
1041
1056
@@ -1342,6 +1357,15 @@ def update_training_job(
1342
1357
remote_debug_config = {
1343
1358
"EnableRemoteDebug": True,
1344
1359
}
1360
+
1361
+ Returns:
1362
+ str: ARN of training job
1363
+
1364
+ Raises:
1365
+ - botocore.exceptions.ClientError: If SageMaker throws an error while updating training
1366
+ job.
1367
+ - botocore.exceptions.ParamValidationError: If any request parameters are in an invalid
1368
+ format.
1345
1369
"""
1346
1370
# No injections from sagemaker_config because the UpdateTrainingJob API's resource_config
1347
1371
# object accepts fewer parameters than the CreateTrainingJob API, and none that the
@@ -1356,9 +1380,28 @@ def update_training_job(
1356
1380
resource_config = resource_config ,
1357
1381
remote_debug_config = remote_debug_config ,
1358
1382
)
1359
- logger .info ("Updating training job with name %s" , job_name )
1360
- logger .debug ("Update request: %s" , json .dumps (update_training_job_request , indent = 4 ))
1361
- self .sagemaker_client .update_training_job (** update_training_job_request )
1383
+ try :
1384
+ logger .info ("Updating training job with name %s" , job_name )
1385
+ logger .debug ("Update request: %s" , json .dumps (update_training_job_request , indent = 4 ))
1386
+ self .sagemaker_client .update_training_job (** update_training_job_request )
1387
+ except botocore .exceptions .ParamValidationError as e :
1388
+ troubleshooting = (
1389
+ "Incorrect request parameter was provided. Check the API documentation: "
1390
+ "https://docs.aws.amazon.com/sagemaker/latest/APIReference/"
1391
+ "API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters"
1392
+ )
1393
+ logger .error ("%s" , troubleshooting )
1394
+ raise e
1395
+ except botocore .exceptions .ClientError as e :
1396
+ troubleshooting = (
1397
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
1398
+ "sagemaker-python-sdk-troubleshooting.html"
1399
+ "#sagemaker-python-sdk-troubleshooting-update-training-job"
1400
+ )
1401
+ logger .error (
1402
+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1403
+ )
1404
+ raise e
1362
1405
1363
1406
def _get_update_training_job_request (
1364
1407
self ,
@@ -1461,6 +1504,10 @@ def process(
1461
1504
* If both `ExperimentName` and `TrialName` are not supplied the trial component
1462
1505
will be unassociated.
1463
1506
* `TrialComponentDisplayName` is used for display in Studio.
1507
+
1508
+ Raises:
1509
+ - botocore.exceptions.ClientError: If SageMaker throws an error while creating
1510
+ processing job.
1464
1511
"""
1465
1512
tags = _append_project_tags (format_tags (tags ))
1466
1513
tags = self ._append_sagemaker_config_tags (
@@ -1524,9 +1571,20 @@ def process(
1524
1571
)
1525
1572
1526
1573
def submit (request ):
1527
- logger .info ("Creating processing-job with name %s" , job_name )
1528
- logger .debug ("process request: %s" , json .dumps (request , indent = 4 ))
1529
- self .sagemaker_client .create_processing_job (** request )
1574
+ try :
1575
+ logger .info ("Creating processing-job with name %s" , job_name )
1576
+ logger .debug ("process request: %s" , json .dumps (request , indent = 4 ))
1577
+ self .sagemaker_client .create_processing_job (** request )
1578
+ except Exception as e :
1579
+ troubleshooting = (
1580
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
1581
+ "sagemaker-python-sdk-troubleshooting.html"
1582
+ "#sagemaker-python-sdk-troubleshooting-create-processing-job"
1583
+ )
1584
+ logger .error (
1585
+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1586
+ )
1587
+ raise e
1530
1588
1531
1589
self ._intercept_create_request (process_request , submit , self .process .__name__ )
1532
1590
@@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
4573
4631
4574
4632
Returns:
4575
4633
str: Name of the Amazon SageMaker ``Endpoint`` created.
4634
+
4635
+ Raises:
4636
+ botocore.exceptions.ClientError: If SageMaker throws an exception while creating
4637
+ endpoint.
4576
4638
"""
4577
4639
logger .info ("Creating endpoint with name %s" , endpoint_name )
4578
4640
@@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
4581
4643
tags = self ._append_sagemaker_config_tags (
4582
4644
tags , "{}.{}.{}" .format (SAGEMAKER , ENDPOINT , TAGS )
4583
4645
)
4584
-
4585
- res = self .sagemaker_client .create_endpoint (
4586
- EndpointName = endpoint_name , EndpointConfigName = config_name , Tags = tags
4587
- )
4588
- if res :
4589
- self .endpoint_arn = res ["EndpointArn" ]
4590
-
4591
- if wait :
4592
- self .wait_for_endpoint (endpoint_name , live_logging = live_logging )
4593
- return endpoint_name
4646
+ try :
4647
+ res = self .sagemaker_client .create_endpoint (
4648
+ EndpointName = endpoint_name , EndpointConfigName = config_name , Tags = tags
4649
+ )
4650
+ if res :
4651
+ self .endpoint_arn = res ["EndpointArn" ]
4652
+
4653
+ if wait :
4654
+ self .wait_for_endpoint (endpoint_name , live_logging = live_logging )
4655
+ return endpoint_name
4656
+ except Exception as e :
4657
+ troubleshooting = (
4658
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
4659
+ "sagemaker-python-sdk-troubleshooting.html"
4660
+ "#sagemaker-python-sdk-troubleshooting-create-endpoint"
4661
+ )
4662
+ logger .error (
4663
+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
4664
+ )
4665
+ raise e
4594
4666
4595
4667
def endpoint_in_service_or_not (self , endpoint_name : str ):
4596
4668
"""Check whether an Amazon SageMaker ``Endpoint`` is in IN_SERVICE status.
@@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
4635
4707
str: Name of the Amazon SageMaker ``Endpoint`` being updated.
4636
4708
4637
4709
Raises:
4638
- ValueError: if the endpoint does not already exist
4710
+ - ValueError: if the endpoint does not already exist
4711
+ - botocore.exceptions.ClientError: If SageMaker throws an error while
4712
+ creating endpoint config, describing endpoint or updating endpoint
4639
4713
"""
4640
4714
if not _deployment_entity_exists (
4641
4715
lambda : self .sagemaker_client .describe_endpoint (EndpointName = endpoint_name )
@@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
4645
4719
"existing endpoint name" .format (endpoint_name )
4646
4720
)
4647
4721
4648
- res = self .sagemaker_client .update_endpoint (
4649
- EndpointName = endpoint_name , EndpointConfigName = endpoint_config_name
4650
- )
4651
- if res :
4652
- self .endpoint_arn = res ["EndpointArn" ]
4722
+ try :
4653
4723
4654
- if wait :
4655
- self .wait_for_endpoint (endpoint_name )
4656
- return endpoint_name
4724
+ res = self .sagemaker_client .update_endpoint (
4725
+ EndpointName = endpoint_name , EndpointConfigName = endpoint_config_name
4726
+ )
4727
+ if res :
4728
+ self .endpoint_arn = res ["EndpointArn" ]
4729
+
4730
+ if wait :
4731
+ self .wait_for_endpoint (endpoint_name )
4732
+ return endpoint_name
4733
+ except Exception as e :
4734
+ troubleshooting = (
4735
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
4736
+ "sagemaker-python-sdk-troubleshooting.html"
4737
+ "#sagemaker-python-sdk-troubleshooting-update-endpoint"
4738
+ )
4739
+ logger .error (
4740
+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
4741
+ )
4742
+ raise e
4657
4743
4658
4744
def is_inference_component_based_endpoint (self , endpoint_name ):
4659
4745
"""Returns 'True' if endpoint is inference-component-based, 'False' otherwise.
@@ -4934,7 +5020,7 @@ def update_inference_component(
4934
5020
return inference_component_name
4935
5021
4936
5022
def delete_inference_component (self , inference_component_name : str , wait : bool = False ):
4937
- """Deletes a InferenceComponent.
5023
+ """Deletes an InferenceComponent.
4938
5024
4939
5025
Args:
4940
5026
inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``
@@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name):
8502
8588
elif status != "Completed" :
8503
8589
reason = desc .get ("FailureReason" , "(No reason provided)" )
8504
8590
job_type = status_key_name .replace ("JobStatus" , " job" )
8505
- message = "Error for {job_type} {job_name}: {status}. Reason: {reason}" .format (
8506
- job_type = job_type , job_name = job , status = status , reason = reason
8591
+ troubleshooting = (
8592
+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
8593
+ "sagemaker-python-sdk-troubleshooting.html"
8594
+ )
8595
+ message = (
8596
+ "Error for {job_type} {job_name}: {status}. Reason: {reason}. "
8597
+ "Check troubleshooting guide for common errors: {troubleshooting}"
8598
+ ).format (
8599
+ job_type = job_type ,
8600
+ job_name = job ,
8601
+ status = status ,
8602
+ reason = reason ,
8603
+ troubleshooting = troubleshooting ,
8507
8604
)
8508
8605
if "CapacityError" in str (reason ):
8509
8606
raise exceptions .CapacityError (
0 commit comments