@@ -196,7 +196,7 @@ def _create_estimator(
196
196
image_uri : str ,
197
197
role : str ,
198
198
sagemaker_session : Optional [Session ],
199
- volume_size : int = 30 ,
199
+ volume_size : int ,
200
200
vpc_config : Optional [
201
201
Dict [
202
202
str ,
@@ -453,7 +453,9 @@ def partition(
453
453
self ,
454
454
instance_type : str ,
455
455
s3_output_uri : str = None ,
456
+ s3_output_prefix : str = "aot-partitioned-checkpoints" ,
456
457
job_name : Optional [str ] = None ,
458
+ volume_size : int = 30 ,
457
459
volume_kms_key : Optional [str ] = None ,
458
460
output_kms_key : Optional [str ] = None ,
459
461
use_spot_instances : bool = False ,
@@ -469,8 +471,13 @@ def partition(
469
471
artifacts and output files). If not specified, results are
470
472
stored to a default bucket. If the bucket with the specific name
471
473
does not exist, it will be created.
474
+ s3_output_prefix (str): Name of the prefix where all the partitioned
475
+ checkpoints will be uploaded. If not provided, the default value is
476
+ aot-partitioned-checkpoints.
472
477
job_name (str): Training job name. If not specified, a unique training job
473
478
name will be created.
479
+ volume_size (int): Size in GB of the storage volume to use for
480
+ storing input and output data during training (default: 30).
474
481
volume_kms_key (str): Optional. KMS key ID for encrypting EBS
475
482
volume attached to the training instance (default: None).
476
483
output_kms_key (str): Optional. KMS key ID for encrypting the
@@ -499,20 +506,19 @@ def partition(
499
506
region_name = self .sagemaker_session .boto_session .region_name
500
507
self .image_uri = self .serving_image_uri (region_name )
501
508
502
- deploy_key_prefix = fw_utils .model_code_key_prefix (
503
- self .key_prefix , self .name , self .image_uri
504
- )
505
509
if s3_output_uri is None :
510
+ deploy_key_prefix = fw_utils .model_code_key_prefix (
511
+ self .key_prefix , self .name , self .image_uri
512
+ )
513
+
506
514
bucket , deploy_key_prefix = s3 .determine_bucket_and_prefix (
507
515
bucket = self .bucket ,
508
516
key_prefix = deploy_key_prefix ,
509
517
sagemaker_session = self .sagemaker_session ,
510
518
)
511
519
s3_output_uri = s3_path_join ("s3://" , bucket , deploy_key_prefix )
512
- else :
513
- s3_output_uri = s3_path_join (s3_output_uri , deploy_key_prefix )
514
520
515
- self .save_mp_checkpoint_path = s3_path_join (s3_output_uri , "aot-partitioned-checkpoints" )
521
+ self .save_mp_checkpoint_path = s3_path_join (s3_output_uri , s3_output_prefix )
516
522
517
523
container_def = self ._upload_model_to_s3 (upload_as_tar = False )
518
524
estimator = _create_estimator (
@@ -521,6 +527,7 @@ def partition(
521
527
image_uri = self .image_uri ,
522
528
role = self .role ,
523
529
sagemaker_session = self .sagemaker_session ,
530
+ volume_size = volume_size ,
524
531
vpc_config = self .vpc_config ,
525
532
volume_kms_key = volume_kms_key ,
526
533
output_kms_key = output_kms_key ,
@@ -924,7 +931,9 @@ def partition(
924
931
self ,
925
932
instance_type : str ,
926
933
s3_output_uri : str = None ,
934
+ s3_output_prefix : str = "aot-partitioned-checkpoints" ,
927
935
job_name : Optional [str ] = None ,
936
+ volume_size : int = 30 ,
928
937
volume_kms_key : Optional [str ] = None ,
929
938
output_kms_key : Optional [str ] = None ,
930
939
use_spot_instances : bool = False ,
@@ -940,8 +949,13 @@ def partition(
940
949
artifacts and output files). If not specified, results are
941
950
stored to a default bucket. If the bucket with the specific name
942
951
does not exist, it will be created.
952
+ s3_output_prefix (str): Name of the prefix where all the partitioned
953
+ checkpoints will be uploaded. If not provided, the default value is
954
+ aot-partitioned-checkpoints.
943
955
job_name (str): Training job name. If not specified, a unique training job
944
956
name will be created.
957
+ volume_size (int): Size in GB of the storage volume to use for
958
+ storing input and output data during training (default: 30).
945
959
volume_kms_key (str): Optional. KMS key ID for encrypting EBS
946
960
volume attached to the training instance (default: None).
947
961
output_kms_key (str): Optional. KMS key ID for encrypting the
@@ -969,7 +983,9 @@ def partition(
969
983
super (DeepSpeedModel , self ).partition (
970
984
instance_type ,
971
985
s3_output_uri ,
972
- job_name ,
986
+ s3_output_prefix = s3_output_prefix ,
987
+ job_name = job_name ,
988
+ volume_size = volume_size ,
973
989
volume_kms_key = volume_kms_key ,
974
990
output_kms_key = output_kms_key ,
975
991
use_spot_instances = use_spot_instances ,
@@ -1096,7 +1112,9 @@ def partition(
1096
1112
self ,
1097
1113
instance_type : str ,
1098
1114
s3_output_uri : str = None ,
1115
+ s3_output_prefix : str = "aot-partitioned-checkpoints" ,
1099
1116
job_name : Optional [str ] = None ,
1117
+ volume_size : int = 30 ,
1100
1118
volume_kms_key : Optional [str ] = None ,
1101
1119
output_kms_key : Optional [str ] = None ,
1102
1120
use_spot_instances : bool = False ,
@@ -1112,8 +1130,13 @@ def partition(
1112
1130
artifacts and output files). If not specified, results are
1113
1131
stored to a default bucket. If the bucket with the specific name
1114
1132
does not exist, it will be created.
1133
+ s3_output_prefix (str): Name of the prefix where all the partitioned
1134
+ checkpoints will be uploaded. If not provided, the default value is
1135
+ aot-partitioned-checkpoints.
1115
1136
job_name (str): Training job name. If not specified, a unique training job
1116
1137
name will be created.
1138
+ volume_size (int): Size in GB of the storage volume to use for
1139
+ storing input and output data during training (default: 30).
1117
1140
volume_kms_key (str): Optional. KMS key ID for encrypting EBS
1118
1141
volume attached to the training instance (default: None).
1119
1142
output_kms_key (str): Optional. KMS key ID for encrypting the
0 commit comments