Skip to content

Commit e240518

Browse files
Aditi2424 and adishaa authored
change: Add troubleshooting links to exceptions (#4844)
Co-authored-by: adishaa <[email protected]>
1 parent f089b5a commit e240518

File tree

6 files changed

+181
-40
lines changed

6 files changed

+181
-40
lines changed

.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ max-returns=6
384384
max-branches=12
385385

386386
# Maximum number of statements in function / method body
387-
max-statements=100
387+
max-statements=105
388388

389389
# Maximum number of parents for a class (see R0901).
390390
max-parents=7

src/sagemaker/algorithm.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,20 @@ def __init__(
157157
available (default: ``None``).
158158
**kwargs: Additional kwargs. This is unused. It's only added for AlgorithmEstimator
159159
to ignore the irrelevant arguments.
160+
161+
Raises:
162+
ValueError:
163+
- If an AWS IAM Role is not provided.
164+
- Bad value for instance type.
165+
RuntimeError:
166+
- When setting up custom VPC, both subnets and security_group_ids are not provided
167+
- If instance_count > 1 (distributed training) with instance type local or local gpu
168+
- If LocalSession is not used with instance type local or local gpu
169+
- file:// output path used outside of local mode
170+
botocore.exceptions.ClientError:
171+
- algorithm arn is incorrect
172+
- insufficient permission to access/ describe algorithm
173+
- algorithm is in a different region
160174
"""
161175
self.algorithm_arn = algorithm_arn
162176
super(AlgorithmEstimator, self).__init__(

src/sagemaker/base_predictor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,8 @@ def update_endpoint(
430430
- If ``initial_instance_count``, ``instance_type``, or ``accelerator_type`` is
431431
specified and either ``model_name`` is ``None`` or there are multiple models
432432
associated with the endpoint.
433+
botocore.exceptions.ClientError: If SageMaker throws an error while creating
434+
endpoint config, describing endpoint or updating endpoint
433435
"""
434436
production_variants = None
435437
current_model_names = self._get_model_names()

src/sagemaker/estimator.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -590,25 +590,36 @@ def __init__(
590590
self.dependencies = dependencies or []
591591
self.uploaded_code: Optional[UploadedCode] = None
592592

593-
# Check that the user properly sets both subnet and secutiry_groupe_ids
593+
# Check that the user properly sets both subnet and security_group_ids
594594
if (
595595
subnets is not None
596596
and security_group_ids is None
597597
or security_group_ids is not None
598598
and subnets is None
599599
):
600+
troubleshooting = (
601+
"Refer to this documentation on using custom VPC: "
602+
"https://sagemaker.readthedocs.io/en/v2.24.0/overview.html"
603+
"#secure-training-and-inference-with-vpc"
604+
)
605+
logger.error("Check troubleshooting guide for common errors: %s", troubleshooting)
606+
600607
raise RuntimeError(
601608
"When setting up custom VPC, both subnets and security_group_ids must be set"
602609
)
603610

604611
if self.instance_type in ("local", "local_gpu"):
605612
if self.instance_type == "local_gpu" and self.instance_count > 1:
606-
raise RuntimeError("Distributed Training in Local GPU is not supported")
613+
raise RuntimeError(
614+
"Distributed Training in Local GPU is not supported."
615+
" Set instance_count to 1."
616+
)
607617
self.sagemaker_session = sagemaker_session or LocalSession()
608618
if not isinstance(self.sagemaker_session, sagemaker.local.LocalSession):
609619
raise RuntimeError(
610620
"instance_type local or local_gpu is only supported with an"
611-
"instance of LocalSession"
621+
"instance of LocalSession. More details on local mode: "
622+
"https://sagemaker.readthedocs.io/en/stable/overview.html#local-mode"
612623
)
613624
else:
614625
self.sagemaker_session = sagemaker_session or Session()
@@ -631,7 +642,11 @@ def __init__(
631642
and not is_pipeline_variable(output_path)
632643
and output_path.startswith("file://")
633644
):
634-
raise RuntimeError("file:// output paths are only supported in Local Mode")
645+
raise RuntimeError(
646+
"The 'file://' output paths are only supported when using Local Mode. "
647+
"To resolve this issue, ensure you're running in Local Mode with a LocalSession, "
648+
"or use an 's3://' output path for jobs running on SageMaker instances."
649+
)
635650
self.output_path = output_path
636651
self.latest_training_job = None
637652
self.jobs = []
@@ -646,7 +661,12 @@ def __init__(
646661
# Now we marked that as Optional because we can fetch it from SageMakerConfig
647662
# Because of marking that parameter as optional, we should validate if it is None, even
648663
# after fetching the config.
649-
raise ValueError("An AWS IAM role is required to create an estimator.")
664+
raise ValueError(
665+
"An AWS IAM role is required to create an estimator. "
666+
"Please provide a valid `role` argument with the ARN of an IAM role"
667+
" that has the necessary SageMaker permissions."
668+
)
669+
650670
self.output_kms_key = resolve_value_from_config(
651671
output_kms_key, TRAINING_JOB_KMS_KEY_ID_PATH, sagemaker_session=self.sagemaker_session
652672
)
@@ -1855,6 +1875,8 @@ def model_data(self):
18551875
if compression_type not in {"GZIP", "NONE"}:
18561876
raise ValueError(
18571877
f'Unrecognized training job output data compression type "{compression_type}"'
1878+
'. Please specify either "GZIP" or "NONE" as valid options for '
1879+
"the compression type."
18581880
)
18591881
# model data is in uncompressed form NOTE SageMaker Hosting mandates presence of
18601882
# trailing forward slash in S3 model data URI, so append one if necessary.

src/sagemaker/local/entities.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,10 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
213213
hyperparameters (dict): The HyperParameters for the training job.
214214
environment (dict): The collection of environment variables passed to the job.
215215
job_name (str): Name of the local training job being run.
216+
217+
Raises:
218+
ValueError: If the input data configuration is not valid.
219+
RuntimeError: If the data distribution type is not supported.
216220
"""
217221
for channel in input_data_config:
218222
if channel["DataSource"] and "S3DataSource" in channel["DataSource"]:
@@ -233,10 +237,12 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
233237
# use a single Data URI - this makes handling S3 and File Data easier down the stack
234238
channel["DataUri"] = data_uri
235239

236-
if data_distribution and data_distribution != "FullyReplicated":
240+
supported_distributions = ["FullyReplicated"]
241+
if data_distribution and data_distribution not in supported_distributions:
237242
raise RuntimeError(
238-
"DataDistribution: %s is not currently supported in Local Mode"
239-
% data_distribution
243+
"Invalid DataDistribution: '{}'. Local mode currently supports: {}.".format(
244+
data_distribution, ", ".join(supported_distributions)
245+
)
240246
)
241247

242248
self.start_time = datetime.datetime.now()

src/sagemaker/session.py

Lines changed: 128 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -950,6 +950,11 @@ def train( # noqa: C901
950950
}
951951
Returns:
952952
str: ARN of the training job, if it is created.
953+
954+
Raises:
955+
- botocore.exceptions.ClientError: If Sagemaker throws an exception while creating
956+
training job.
957+
- ValueError: If both image_uri and algorithm are provided, or if neither is provided.
953958
"""
954959
tags = _append_project_tags(format_tags(tags))
955960
tags = self._append_sagemaker_config_tags(
@@ -1033,9 +1038,19 @@ def train( # noqa: C901
10331038
)
10341039

10351040
def submit(request):
1036-
logger.info("Creating training-job with name: %s", job_name)
1037-
logger.debug("train request: %s", json.dumps(request, indent=4))
1038-
self.sagemaker_client.create_training_job(**request)
1041+
try:
1042+
logger.info("Creating training-job with name: %s", job_name)
1043+
logger.debug("train request: %s", json.dumps(request, indent=4))
1044+
self.sagemaker_client.create_training_job(**request)
1045+
except Exception as e:
1046+
troubleshooting = (
1047+
"https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html"
1048+
"#sagemaker-python-sdk-troubleshooting-create-training-job"
1049+
)
1050+
logger.error(
1051+
"Please check the troubleshooting guide for common errors: %s", troubleshooting
1052+
)
1053+
raise e
10391054

10401055
self._intercept_create_request(train_request, submit, self.train.__name__)
10411056

@@ -1342,6 +1357,15 @@ def update_training_job(
13421357
remote_debug_config = {
13431358
"EnableRemoteDebug": True,
13441359
}
1360+
1361+
Returns:
1362+
str: ARN of training job
1363+
1364+
Raises:
1365+
- botocore.exceptions.ClientError: If Sagemaker throws an error while updating training
1366+
job.
1367+
- botocore.exceptions.ParamValidationError: If any request parameters are in an invalid
1368+
format.
13451369
"""
13461370
# No injections from sagemaker_config because the UpdateTrainingJob API's resource_config
13471371
# object accepts fewer parameters than the CreateTrainingJob API, and none that the
@@ -1356,9 +1380,28 @@ def update_training_job(
13561380
resource_config=resource_config,
13571381
remote_debug_config=remote_debug_config,
13581382
)
1359-
logger.info("Updating training job with name %s", job_name)
1360-
logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
1361-
self.sagemaker_client.update_training_job(**update_training_job_request)
1383+
try:
1384+
logger.info("Updating training job with name %s", job_name)
1385+
logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
1386+
self.sagemaker_client.update_training_job(**update_training_job_request)
1387+
except botocore.exceptions.ParamValidationError as e:
1388+
troubleshooting = (
1389+
"Incorrect request parameter was provided. Check the API documentation: "
1390+
"https://docs.aws.amazon.com/sagemaker/latest/APIReference/"
1391+
"API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters"
1392+
)
1393+
logger.error("%s", troubleshooting)
1394+
raise e
1395+
except botocore.exceptions.ClientError as e:
1396+
troubleshooting = (
1397+
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
1398+
"sagemaker-python-sdk-troubleshooting.html"
1399+
"#sagemaker-python-sdk-troubleshooting-update-training-job"
1400+
)
1401+
logger.error(
1402+
"Please check the troubleshooting guide for common errors: %s", troubleshooting
1403+
)
1404+
raise e
13621405

13631406
def _get_update_training_job_request(
13641407
self,
@@ -1461,6 +1504,10 @@ def process(
14611504
* If both `ExperimentName` and `TrialName` are not supplied the trial component
14621505
will be unassociated.
14631506
* `TrialComponentDisplayName` is used for display in Studio.
1507+
1508+
Raises:
1509+
- botocore.exceptions.ClientError: If Sagemaker throws an error while creating
1510+
processing job.
14641511
"""
14651512
tags = _append_project_tags(format_tags(tags))
14661513
tags = self._append_sagemaker_config_tags(
@@ -1524,9 +1571,20 @@ def process(
15241571
)
15251572

15261573
def submit(request):
1527-
logger.info("Creating processing-job with name %s", job_name)
1528-
logger.debug("process request: %s", json.dumps(request, indent=4))
1529-
self.sagemaker_client.create_processing_job(**request)
1574+
try:
1575+
logger.info("Creating processing-job with name %s", job_name)
1576+
logger.debug("process request: %s", json.dumps(request, indent=4))
1577+
self.sagemaker_client.create_processing_job(**request)
1578+
except Exception as e:
1579+
troubleshooting = (
1580+
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
1581+
"sagemaker-python-sdk-troubleshooting.html"
1582+
"#sagemaker-python-sdk-troubleshooting-create-processing-job"
1583+
)
1584+
logger.error(
1585+
"Please check the troubleshooting guide for common errors: %s", troubleshooting
1586+
)
1587+
raise e
15301588

15311589
self._intercept_create_request(process_request, submit, self.process.__name__)
15321590

@@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
45734631
45744632
Returns:
45754633
str: Name of the Amazon SageMaker ``Endpoint`` created.
4634+
4635+
Raises:
4636+
botocore.exceptions.ClientError: If Sagemaker throws an exception while creating
4637+
endpoint.
45764638
"""
45774639
logger.info("Creating endpoint with name %s", endpoint_name)
45784640

@@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
45814643
tags = self._append_sagemaker_config_tags(
45824644
tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT, TAGS)
45834645
)
4584-
4585-
res = self.sagemaker_client.create_endpoint(
4586-
EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
4587-
)
4588-
if res:
4589-
self.endpoint_arn = res["EndpointArn"]
4590-
4591-
if wait:
4592-
self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
4593-
return endpoint_name
4646+
try:
4647+
res = self.sagemaker_client.create_endpoint(
4648+
EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
4649+
)
4650+
if res:
4651+
self.endpoint_arn = res["EndpointArn"]
4652+
4653+
if wait:
4654+
self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
4655+
return endpoint_name
4656+
except Exception as e:
4657+
troubleshooting = (
4658+
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
4659+
"sagemaker-python-sdk-troubleshooting.html"
4660+
"#sagemaker-python-sdk-troubleshooting-create-endpoint"
4661+
)
4662+
logger.error(
4663+
"Please check the troubleshooting guide for common errors: %s", troubleshooting
4664+
)
4665+
raise e
45944666

45954667
def endpoint_in_service_or_not(self, endpoint_name: str):
45964668
"""Check whether an Amazon SageMaker ``Endpoint``` is in IN_SERVICE status.
@@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
46354707
str: Name of the Amazon SageMaker ``Endpoint`` being updated.
46364708
46374709
Raises:
4638-
ValueError: if the endpoint does not already exist
4710+
- ValueError: if the endpoint does not already exist
4711+
- botocore.exceptions.ClientError: If SageMaker throws an error while
4712+
creating endpoint config, describing endpoint or updating endpoint
46394713
"""
46404714
if not _deployment_entity_exists(
46414715
lambda: self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
@@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
46454719
"existing endpoint name".format(endpoint_name)
46464720
)
46474721

4648-
res = self.sagemaker_client.update_endpoint(
4649-
EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
4650-
)
4651-
if res:
4652-
self.endpoint_arn = res["EndpointArn"]
4722+
try:
46534723

4654-
if wait:
4655-
self.wait_for_endpoint(endpoint_name)
4656-
return endpoint_name
4724+
res = self.sagemaker_client.update_endpoint(
4725+
EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
4726+
)
4727+
if res:
4728+
self.endpoint_arn = res["EndpointArn"]
4729+
4730+
if wait:
4731+
self.wait_for_endpoint(endpoint_name)
4732+
return endpoint_name
4733+
except Exception as e:
4734+
troubleshooting = (
4735+
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
4736+
"sagemaker-python-sdk-troubleshooting.html"
4737+
"#sagemaker-python-sdk-troubleshooting-update-endpoint"
4738+
)
4739+
logger.error(
4740+
"Please check the troubleshooting guide for common errors: %s", troubleshooting
4741+
)
4742+
raise e
46574743

46584744
def is_inference_component_based_endpoint(self, endpoint_name):
46594745
"""Returns 'True' if endpoint is inference-component-based, 'False' otherwise.
@@ -4934,7 +5020,7 @@ def update_inference_component(
49345020
return inference_component_name
49355021

49365022
def delete_inference_component(self, inference_component_name: str, wait: bool = False):
4937-
"""Deletes a InferenceComponent.
5023+
"""Deletes an InferenceComponent.
49385024
49395025
Args:
49405026
inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``
@@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name):
85028588
elif status != "Completed":
85038589
reason = desc.get("FailureReason", "(No reason provided)")
85048590
job_type = status_key_name.replace("JobStatus", " job")
8505-
message = "Error for {job_type} {job_name}: {status}. Reason: {reason}".format(
8506-
job_type=job_type, job_name=job, status=status, reason=reason
8591+
troubleshooting = (
8592+
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
8593+
"sagemaker-python-sdk-troubleshooting.html"
8594+
)
8595+
message = (
8596+
"Error for {job_type} {job_name}: {status}. Reason: {reason}. "
8597+
"Check troubleshooting guide for common errors: {troubleshooting}"
8598+
).format(
8599+
job_type=job_type,
8600+
job_name=job,
8601+
status=status,
8602+
reason=reason,
8603+
troubleshooting=troubleshooting,
85078604
)
85088605
if "CapacityError" in str(reason):
85098606
raise exceptions.CapacityError(

0 commit comments

Comments
 (0)