14
14
from __future__ import absolute_import
15
15
16
16
import json
17
+ import os .path
18
+ import tempfile
17
19
from enum import Enum
18
20
from typing import Optional , Union , Dict
19
21
20
- from sagemaker import s3 , Predictor
22
+ import sagemaker
23
+ from sagemaker import s3 , Predictor , image_uris , fw_utils
21
24
from sagemaker .deserializers import JSONDeserializer
22
25
from sagemaker .model import FrameworkModel
23
26
from sagemaker .serializers import JSONSerializer
@@ -71,24 +74,49 @@ def __new__(
71
74
    def __init__(
        self,
        uncompressed_model_data: str,
        djl_version: str = None,
        task: str = None,
        data_type: str = None,
        tensor_parallel_degree: int = None,
        min_workers: int = None,
        max_workers: int = None,
        job_queue_size: int = None,
        parallel_loading: bool = False,
        model_loading_timeout: int = None,
        prediction_timeout: int = None,
        role: str = None,
        entry_point: Optional[str] = None,
        image_uri: Optional[Union[str, PipelineVariable]] = None,
        predictor_cls: callable = DJLLargeModelPredictor,
        **kwargs
    ):
        """Initialize a DJL large-model wrapper around uncompressed model artifacts.

        Args:
            uncompressed_model_data: S3 URI of the uncompressed model artifacts
                (passed to DJL serving via ``option.s3url`` — see
                ``_validate_and_write_serving_properties``).
            djl_version: DJL serving container version; when None it is
                defaulted lazily in ``serving_image_uri``.
            task: inference task name (written as ``option.task``).
            data_type: model data type (written as ``option.dtype``).
            tensor_parallel_degree: number of devices for tensor parallelism.
            min_workers: minimum number of serving workers (``minWorkers``).
            max_workers: maximum number of serving workers (``maxWorkers``).
            job_queue_size: request queue size (``job_queue_size``).
            parallel_loading: whether workers load the model in parallel.
            model_loading_timeout: model load timeout, presumably seconds —
                TODO(review): confirm unit against DJL serving docs.
            prediction_timeout: prediction timeout, presumably seconds —
                TODO(review): confirm unit against DJL serving docs.
            role: IAM role ARN used by SageMaker on your behalf.
            entry_point: optional custom inference entry point script.
            image_uri: explicit serving image; when None the image is resolved
                from ``serving_image_uri`` at deploy time.
            predictor_cls: predictor class attached to deployed endpoints.
            **kwargs: forwarded to ``FrameworkModel.__init__``.
        """
        self.uncompressed_model_data = uncompressed_model_data
        self.djl_version = djl_version
        self.task = task
        self.data_type = data_type
        self.tensor_parallel_degree = tensor_parallel_degree
        self.min_workers = min_workers
        self.max_workers = max_workers
        self.job_queue_size = job_queue_size
        self.parallel_loading = parallel_loading
        self.model_loading_timeout = model_loading_timeout
        self.prediction_timeout = prediction_timeout
        # model_data is intentionally None: artifacts are uncompressed in S3
        # and referenced via option.s3url instead of a model.tar.gz.
        super(DJLLargeModel, self).__init__(
            None, image_uri, role, entry_point, predictor_cls=predictor_cls, **kwargs
        )
        # Fall back to a default Session when the base class did not set one.
        self.sagemaker_session = self.sagemaker_session or Session()
91
108
109
+ def serving_image_uri (self , region_name ):
110
+ if not self .djl_version :
111
+ self .djl_version = "0.20.0"
112
+
113
+ return image_uris .retrieve (
114
+ self ._framework (),
115
+ region_name ,
116
+ version = self .djl_version ,
117
+ )
118
+
119
+
92
120
def _determine_engine_for_model_type (model_type : str ):
93
121
if model_type in defaults .DEEPSPEED_RECOMMENDED_ARCHITECTURES :
94
122
return DeepSpeedModel
@@ -102,6 +130,8 @@ def _validate_engine_for_model_type(model_type: str, engine: DJLEngine):
102
130
103
131
class DeepSpeedModel (DJLLargeModel ):
104
132
133
+ _framework_name = "djl-deepspeed"
134
+
105
135
def __init__ (
106
136
self ,
107
137
uncompressed_model_data : str ,
@@ -138,8 +168,92 @@ def __init__(
138
168
** kwargs ,
139
169
)
140
170
171
+ def prepare_container_def (
172
+ self ,
173
+ instance_type = None ,
174
+ accelerator_type = None ,
175
+ serverless_inference_config = None ,
176
+ ):
177
+ if serverless_inference_config is not None :
178
+ raise ValueError ("DJLLargeModel does not support serverless deployment" )
179
+ if accelerator_type is not None :
180
+ raise ValueError ("DJLLargeModel does not support Elastic Inference accelerator" )
181
+
182
+ deploy_image = self .image_uri
183
+ if not deploy_image :
184
+ region_name = self .sagemaker_session .boto_session .region_name
185
+ deploy_image = self .serving_image_uri (region_name )
186
+
187
+ print (f"Deploy image is{ deploy_image } " )
188
+ tmp_dir = self ._validate_and_write_serving_properties ()
189
+ deploy_key_prefix = fw_utils .model_code_key_prefix (self .key_prefix , self .name , deploy_image )
190
+ bucket = self .bucket or self .sagemaker_session .default_bucket ()
191
+ print (f"bucket to upload code to is { bucket } " )
192
+ # self.uploaded_code = fw_utils.tar_and_upload_dir(
193
+ # session=self.sagemaker_session.boto_session,
194
+ # bucket=bucket,
195
+ # s3_key_prefix=deploy_key_prefix,
196
+ # directory=tmp_dir,
197
+ # script=None,
198
+ # )
199
+
200
+ def _validate_and_write_serving_properties (self ):
201
+ serving_properties = {
202
+ "engine" : "DeepSpeed" ,
203
+ "option.entryPoint" : "djl_python.deepspeed" ,
204
+ "option.s3url" : self .uncompressed_model_data ,
205
+ }
206
+ if self .max_tokens :
207
+ serving_properties ["option.max_tokens" ] = self .max_tokens
208
+ if self .low_cpu_mem_usage :
209
+ serving_properties ["option.low_cpu_mem_usage" ] = self .low_cpu_mem_usage
210
+ if self .enable_cuda_graph :
211
+ if self .tensor_parallel_degree > 1 :
212
+ raise ValueError ("enable_cuda_graph is not supported when tensor_parallel_degree > 1" )
213
+ serving_properties ["option.enable_cuda_graph" ] = self .enable_cuda_graph
214
+ if self .triangular_masking :
215
+ serving_properties ["option.triangular_masking" ] = self .triangular_masking
216
+ if self .return_tuple :
217
+ serving_properties ["option.return_tuple" ] = self .return_tuple
218
+ if self .deepspeed_checkpoint_file :
219
+ serving_properties ["option.checkpoint" ] = self .deepspeed_checkpoint_file
220
+ if self .tensor_parallel_degree :
221
+ serving_properties ["option.tensor_parallel_degree" ] = self .tensor_parallel_degree
222
+ if self .entry_point :
223
+ serving_properties ["entryPoint" ] = self .entry_point
224
+ if self .task :
225
+ serving_properties ["option.task" ] = self .task
226
+ if self .data_type :
227
+ serving_properties ["option.dtype" ] = self .data_type
228
+ if self .min_workers :
229
+ serving_properties ["minWorkers" ] = self .min_workers
230
+ if self .max_workers :
231
+ serving_properties ["maxWorkers" ] = self .max_workers
232
+ if self .job_queue_size :
233
+ serving_properties ["job_queue_size" ] = self .job_queue_size
234
+ if self .parallel_loading :
235
+ serving_properties ["option.parallel_loading" ] = self .parallel_loading
236
+ if self .model_loading_timeout :
237
+ serving_properties ["option.model_loading_timeout" ] = self .model_loading_timeout
238
+ if self .prediction_timeout :
239
+ serving_properties ["option.prediction_timeout" ] = self .prediction_timeout
240
+
241
+ local_dir = None if self .sagemaker_session .settings else self .sagemaker_session .settings .local_download_dir
242
+ tmp_dir = tempfile .mkdtemp (dir = local_dir )
243
+
244
+ with open (os .path .join (tmp_dir , "serving.properties" ), 'w+' ) as f :
245
+ for key , value in serving_properties .items ():
246
+ f .write (f"{ key } ={ value } \n " )
247
+
248
+ print (f"wrote serving.properties to { tmp_dir } " )
249
+
250
+ return tmp_dir
251
+
252
+
141
253
class HuggingfaceAccelerateModel (DJLLargeModel ):
142
254
255
+ _framework_name = "djl-deepspeed"
256
+
143
257
def __init__ (
144
258
self ,
145
259
uncompressed_model_data : str ,
@@ -172,3 +286,21 @@ def __init__(
172
286
** kwargs
173
287
)
174
288
289
if __name__ == "__main__":
    # Ad-hoc manual smoke test for DJLLargeModel — not part of the library API.
    # NOTE(review): hardcoded account-specific role ARN and S3 bucket below;
    # this debug harness should be removed or moved to an integration test
    # before release.
    session = Session()
    role = "arn:aws:iam::125045733377:role/AmazonSageMaker-ExecutionRole-djl"
    # max_tokens/parallel_loading are forwarded through **kwargs; presumably
    # DJLLargeModel.__new__ dispatches to an engine subclass that accepts
    # max_tokens — TODO(review): confirm, DJLLargeModel.__init__ itself does
    # not declare it.
    opt_model = DJLLargeModel(
        "s3://dlc-deepspeed-test-temp/opt-2.7b/",
        tensor_parallel_degree=2,
        data_type="fp32",
        task="text-generation",
        max_tokens=2048,
        parallel_loading=True,
        role=role,
        sagemaker_session=session,
    )
    opt_model.prepare_container_def()
    # opt_model.deploy(
    #     initial_instance_count=1,
    #     instance_type="ml.g5.12xl"
    # )
0 commit comments