15
15
import logging
16
16
from typing import Type
17
17
from abc import ABC , abstractmethod
18
- from pathlib import Path
19
18
from datetime import datetime , timedelta
20
19
21
20
from sagemaker .model import Model
31
30
_more_performant ,
32
31
_pretty_print_results ,
33
32
)
33
+ from sagemaker .serve .utils .hf_utils import _get_model_config_properties_from_hf
34
34
from sagemaker .serve .model_server .djl_serving .utils import (
35
- _auto_detect_engine ,
36
- _set_serve_properties ,
37
35
_get_admissible_tensor_parallel_degrees ,
38
36
_get_admissible_dtypes ,
39
37
_get_default_tensor_parallel_degree ,
38
+ _get_default_djl_configurations ,
40
39
)
41
40
from sagemaker .serve .utils .local_hardware import (
42
41
_get_nb_instance ,
45
44
_get_gpu_info_fallback ,
46
45
)
47
46
from sagemaker .serve .model_server .djl_serving .prepare import (
48
- prepare_for_djl_serving ,
49
47
_create_dir_structure ,
50
48
)
51
49
from sagemaker .serve .utils .predictors import DjlLocalModePredictor
52
- from sagemaker .serve .utils .types import ModelServer , _DjlEngine
50
+ from sagemaker .serve .utils .types import ModelServer
53
51
from sagemaker .serve .mode .function_pointers import Mode
54
52
from sagemaker .serve .utils .telemetry_logger import _capture_telemetry
55
- from sagemaker .djl_inference .model import (
56
- DeepSpeedModel ,
57
- FasterTransformerModel ,
58
- HuggingFaceAccelerateModel ,
59
- )
53
+ from sagemaker .djl_inference .model import DJLModel
60
54
from sagemaker .base_predictor import PredictorBase
61
55
62
56
logger = logging .getLogger (__name__ )
63
57
64
58
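# Match JumpStart DJL entrypoint format. NOTE(review): the entry-point constant this comment introduced (_DJL_MODEL_BUILDER_ENTRY_POINT) is deleted by this change; confirm the comment still applies to _CODE_FOLDER or drop it.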
65
- _DJL_MODEL_BUILDER_ENTRY_POINT = "inference.py"
66
59
_CODE_FOLDER = "code"
67
60
_INVALID_SAMPLE_DATA_EX = (
68
61
'For djl-serving, sample input must be of {"inputs": str, "parameters": dict}, '
@@ -88,14 +81,11 @@ def __init__(self):
88
81
self .vpc_config = None
89
82
self ._original_deploy = None
90
83
self .secret_key = None
91
- self .engine = None
92
84
self .hf_model_config = None
93
85
self ._default_tensor_parallel_degree = None
94
86
self ._default_data_type = None
95
87
self ._default_max_tokens = None
96
- self ._default_max_new_tokens = None
97
88
self .pysdk_model = None
98
- self .overwrite_props_from_file = None
99
89
self .schema_builder = None
100
90
self .env_vars = None
101
91
self .nb_instance_type = None
@@ -130,37 +120,15 @@ def _validate_djl_serving_sample_data(self):
130
120
131
121
def _create_djl_model (self ) -> Type [Model ]:
132
122
"""Placeholder docstring"""
133
- code_dir = str (Path (self .model_path ).joinpath (_CODE_FOLDER ))
134
-
135
- kwargs = {
136
- "model_id" : self .model ,
137
- "role" : self .serve_settings .role_arn ,
138
- "entry_point" : _DJL_MODEL_BUILDER_ENTRY_POINT ,
139
- "dtype" : self ._default_data_type ,
140
- "sagemaker_session" : self .sagemaker_session ,
141
- "source_dir" : code_dir ,
142
- "env" : self .env_vars ,
143
- "hf_hub_token" : self .env_vars .get ("HUGGING_FACE_HUB_TOKEN" ),
144
- "image_config" : self .image_config ,
145
- "vpc_config" : self .vpc_config ,
146
- }
147
-
148
- if self .engine == _DjlEngine .DEEPSPEED :
149
- pysdk_model = DeepSpeedModel (
150
- tensor_parallel_degree = self ._default_tensor_parallel_degree ,
151
- max_tokens = self ._default_max_tokens ,
152
- ** kwargs ,
153
- )
154
- elif self .engine == _DjlEngine .FASTER_TRANSFORMER :
155
- pysdk_model = FasterTransformerModel (
156
- tensor_parallel_degree = self ._default_tensor_parallel_degree ,
157
- ** kwargs ,
158
- )
159
- else :
160
- pysdk_model = HuggingFaceAccelerateModel (
161
- number_of_partitions = self ._default_tensor_parallel_degree ,
162
- ** kwargs ,
163
- )
123
+ pysdk_model = DJLModel (
124
+ model_id = self .model ,
125
+ role = self .serve_settings .role_arn ,
126
+ sagemaker_session = self .sagemaker_session ,
127
+ env = self .env_vars ,
128
+ huggingface_hub_token = self .env_vars .get ("HF_TOKEN" ),
129
+ image_config = self .image_config ,
130
+ vpc_config = self .vpc_config ,
131
+ )
164
132
165
133
if not self .image_uri :
166
134
self .image_uri = pysdk_model .serving_image_uri (self .sagemaker_session .boto_region_name )
@@ -196,7 +164,6 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBa
196
164
else :
197
165
raise ValueError ("Mode %s is not supported!" % overwrite_mode )
198
166
199
- manual_set_props = None
200
167
if self .mode == Mode .SAGEMAKER_ENDPOINT :
201
168
if self .nb_instance_type and "instance_type" not in kwargs :
202
169
kwargs .update ({"instance_type" : self .nb_instance_type })
@@ -212,17 +179,9 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBa
212
179
default_tensor_parallel_degree = _get_default_tensor_parallel_degree (
213
180
self .hf_model_config , tot_gpus
214
181
)
215
- manual_set_props = {
216
- "option.tensor_parallel_degree" : str (default_tensor_parallel_degree ) + "\n "
217
- }
218
-
219
- prepare_for_djl_serving (
220
- model_path = self .model_path ,
221
- model = self .pysdk_model ,
222
- dependencies = self .dependencies ,
223
- overwrite_props_from_file = self .overwrite_props_from_file ,
224
- manual_set_props = manual_set_props ,
225
- )
182
+ self .pysdk_model .env .update (
183
+ {"TENSOR_PARALLEL_DEGREE" : str (default_tensor_parallel_degree )}
184
+ )
226
185
227
186
serializer = self .schema_builder .input_serializer
228
187
deserializer = self .schema_builder ._output_deserializer
@@ -239,7 +198,7 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBa
239
198
timeout if timeout else 1800 ,
240
199
self .secret_key ,
241
200
predictor ,
242
- self .env_vars ,
201
+ self .pysdk_model . env ,
243
202
)
244
203
ram_usage_after = _get_ram_usage_mb ()
245
204
@@ -281,25 +240,22 @@ def _djl_model_builder_deploy_wrapper(self, *args, **kwargs) -> Type[PredictorBa
281
240
282
241
def _build_for_hf_djl (self ):
283
242
"""Placeholder docstring"""
284
- self .overwrite_props_from_file = True
285
243
self .nb_instance_type = _get_nb_instance ()
286
244
287
245
_create_dir_structure (self .model_path )
288
- self .engine , self .hf_model_config = _auto_detect_engine (
289
- self .model , self .env_vars .get ("HUGGING_FACE_HUB_TOKEN" )
290
- )
291
-
292
246
if not hasattr (self , "pysdk_model" ):
293
- (
294
- self ._default_tensor_parallel_degree ,
295
- self ._default_data_type ,
296
- _ ,
297
- self ._default_max_tokens ,
298
- self ._default_max_new_tokens ,
299
- ) = _set_serve_properties (self .hf_model_config , self .schema_builder )
247
+ self .env_vars .update ({"HF_MODEL_ID" : self .model })
248
+ self .hf_model_config = _get_model_config_properties_from_hf (
249
+ self .model , self .env_vars .get ("HF_TOKEN" )
250
+ )
251
+ default_djl_configurations , _default_max_new_tokens = _get_default_djl_configurations (
252
+ self .model , self .hf_model_config , self .schema_builder
253
+ )
254
+ self .env_vars .update (default_djl_configurations )
300
255
self .schema_builder .sample_input ["parameters" ][
301
256
"max_new_tokens"
302
- ] = self ._default_max_new_tokens
257
+ ] = _default_max_new_tokens
258
+ logger .info (f"env vars are { self .env_vars } " )
303
259
self .pysdk_model = self ._create_djl_model ()
304
260
305
261
if self .mode == Mode .LOCAL_CONTAINER :
@@ -316,8 +272,6 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
316
272
)
317
273
return self .pysdk_model
318
274
319
- self .overwrite_props_from_file = False
320
-
321
275
admissible_tensor_parallel_degrees = _get_admissible_tensor_parallel_degrees (
322
276
self .hf_model_config
323
277
)
@@ -337,8 +291,9 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
337
291
"Trying tensor parallel degree: %s, dtype: %s..." , tensor_parallel_degree , dtype
338
292
)
339
293
340
- self ._default_tensor_parallel_degree = tensor_parallel_degree
341
- self ._default_data_type = dtype
294
+ self .env_vars .update (
295
+ {"TENSOR_PARALLEL_DEGREE" : str (tensor_parallel_degree ), "OPTION_DTYPE" : dtype }
296
+ )
342
297
self .pysdk_model = self ._create_djl_model ()
343
298
344
299
try :
@@ -353,15 +308,15 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
353
308
predictor , self .schema_builder .sample_input
354
309
)
355
310
356
- serving_properties = self .pysdk_model .generate_serving_properties ()
311
+ tested_env = self .pysdk_model .env . copy ()
357
312
logger .info (
358
313
"Average latency: %s, throughput/s: %s for configuration: %s" ,
359
314
avg_latency ,
360
315
throughput_per_second ,
361
- serving_properties ,
316
+ tested_env ,
362
317
)
363
318
benchmark_results [avg_latency ] = [
364
- serving_properties ,
319
+ tested_env ,
365
320
p90 ,
366
321
avg_tokens_per_second ,
367
322
throughput_per_second ,
@@ -449,48 +404,43 @@ def _tune_for_hf_djl(self, max_tuning_duration: int = 1800):
449
404
if best_tuned_combination :
450
405
self ._default_tensor_parallel_degree = best_tuned_combination [1 ]
451
406
self ._default_data_type = best_tuned_combination [2 ]
407
+ self .env_vars .update (
408
+ {
409
+ "TENSOR_PARALLEL_DEGREE" : str (self ._default_tensor_parallel_degree ),
410
+ "OPTION_DTYPE" : self ._default_data_type ,
411
+ }
412
+ )
452
413
self .pysdk_model = self ._create_djl_model ()
453
414
454
415
_pretty_print_results (benchmark_results )
455
416
logger .info (
456
417
"Model Configuration: %s was most performant with avg latency: %s, "
457
418
"p90 latency: %s, average tokens per second: %s, throughput/s: %s, "
458
419
"standard deviation of request %s" ,
459
- self .pysdk_model .generate_serving_properties () ,
420
+ self .pysdk_model .env ,
460
421
best_tuned_combination [0 ],
461
422
best_tuned_combination [3 ],
462
423
best_tuned_combination [4 ],
463
424
best_tuned_combination [5 ],
464
425
best_tuned_combination [6 ],
465
426
)
466
427
else :
467
- (
468
- self ._default_tensor_parallel_degree ,
469
- self ._default_data_type ,
470
- _ ,
471
- self ._default_max_tokens ,
472
- self ._default_max_new_tokens ,
473
- ) = _set_serve_properties (self .hf_model_config , self .schema_builder )
428
+ default_djl_configurations , _default_max_new_tokens = _get_default_djl_configurations (
429
+ self .model , self .hf_model_config , self .schema_builder
430
+ )
431
+ self .env_vars .update (default_djl_configurations )
474
432
self .schema_builder .sample_input ["parameters" ][
475
433
"max_new_tokens"
476
- ] = self . _default_max_new_tokens
434
+ ] = _default_max_new_tokens
477
435
self .pysdk_model = self ._create_djl_model ()
478
436
479
437
logger .debug (
480
438
"Failed to gather any tuning results. "
481
439
"Please inspect the stack trace emitted from live logging for more details. "
482
440
"Falling back to default serving.properties: %s" ,
483
- self .pysdk_model .generate_serving_properties () ,
441
+ self .pysdk_model .env ,
484
442
)
485
443
486
- prepare_for_djl_serving (
487
- model_path = self .model_path ,
488
- model = self .pysdk_model ,
489
- dependencies = self .dependencies ,
490
- overwrite_props_from_file = self .overwrite_props_from_file ,
491
- )
492
- self .overwrite_props_from_file = True
493
-
494
444
return self .pysdk_model
495
445
496
446
def _build_for_djl (self ):
0 commit comments