@@ -20,7 +20,9 @@

 from pathlib import Path

+from accelerate.commands.estimate import estimate_command_parser, gather_data
 from sagemaker import Session
+from sagemaker.djl_inference import defaults
 from sagemaker.model import Model
 from sagemaker.base_predictor import PredictorBase
 from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
@@ -39,6 +41,7 @@
 from sagemaker.serve.save_retrive.version_1_0_0.metadata.metadata import Metadata
 from sagemaker.serve.spec.inference_spec import InferenceSpec
 from sagemaker.serve.utils.predictors import _get_local_mode_predictor
+from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
 from sagemaker.serve.detector.image_detector import (
     auto_detect_container,
     _detect_framework_and_version,
@@ -65,6 +68,8 @@
     ModelServer.DJL_SERVING,
 }

+MIB_CONVERSION_FACTOR = 0.00000095367431640625
+MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% buffer

 # pylint: disable=attribute-defined-outside-init
 @dataclass
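A note on the two new constants: `MIB_CONVERSION_FACTOR` is exactly 1 / 2**20, i.e. the bytes-to-MiB divisor, and `MEMORY_BUFFER_MULTIPLIER` applies the 20% inference-time padding cited from EleutherAI further down. A quick sanity check plus a worked example (the 7B model size is illustrative, not from this PR):

```python
# MIB_CONVERSION_FACTOR is exactly 1 / 2**20 (bytes -> MiB).
assert 0.00000095367431640625 == 1 / 2**20

# Illustrative: a 7B-parameter model in float32 (4 bytes per parameter).
size_bytes = 7_000_000_000 * 4
size_mib = 1.2 * size_bytes / 2**20  # with the 20% inference buffer
print(round(size_mib))  # ~32043 MiB, too large for a single 24 GiB GPU
```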
@@ -567,7 +572,7 @@ def wrapper(*args, **kwargs):
     # It supports two modes of deployment
     # 1/ SageMaker Endpoint
     # 2/ Local launch with container
-    def build(
+    def build(  # pylint: disable=R0911
         self,
         mode: Type[Mode] = None,
         role_arn: str = None,
@@ -616,6 +621,10 @@ def build(
                 )
                 if hf_model_md.get("pipeline_tag") == "text-generation":  # pylint: disable=R1705
                     return self._build_for_tgi()
+                elif self.can_fit_on_single_gpu():
+                    return self._build_for_transformers()
+                elif self.model in defaults.FASTER_TRANSFORMER_SUPPORTED_ARCHITECTURES:
+                    return self._build_for_djl()
                 else:
                     return self._build_for_transformers()

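With this change, text-generation models still route to TGI; for other pipeline tags, build() now prefers the transformers path when the model fits on one GPU, falls back to DJL for FasterTransformer-supported architectures, and otherwise lands on transformers anyway. A hypothetical invocation exercising the new branches, assuming credentials and region are already configured (model id and instance type are illustrative, not from this PR):

```python
from sagemaker.serve.builder.model_builder import ModelBuilder

# Hypothetical usage; any non-text-generation HF model id follows the new path.
builder = ModelBuilder(model="bert-base-uncased", instance_type="ml.g5.2xlarge")
model = builder.build()  # dispatches via can_fit_on_single_gpu() introduced here
```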
@@ -672,3 +681,58 @@ def validate(self, model_dir: str) -> Type[bool]:
         """

         return get_metadata(model_dir)
+
+    def total_inference_model_size_mib(self):
+        """Calculate the model's in-memory size via HF accelerate.
+
+        This function gets the model size from accelerate, adds a 20%
+        padding, and converts the result to MiB. When performing inference,
+        expect to add up to an additional 20% to the given model size, as
+        found by EleutherAI.
+        """
+        dtypes = self.env_vars.get("dtypes", "float32")
+
+        try:
+            parser = estimate_command_parser()
+            args = parser.parse_args([self.model, "--dtypes", dtypes])
+        except ValueError:
+            logger.error("Args specified incorrectly for model %s", self.model)
+            raise
+
+        output = gather_data(
+            args
+        )  # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
+
+        total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
+        logger.info("Total memory size MIB: %s", total_memory_size_mib)
+        return total_memory_size_mib
+
+    def can_fit_on_single_gpu(self):
+        """Check whether the model fits on a single GPU.
+
+        This function gets the GPU info, or falls back to a static lookup,
+        to determine the memory size of a single GPU on the target instance,
+        and returns True if the estimated model size fits within it.
+        """
+        try:
+            gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
+            logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
+            single_gpu_size_mib = gpu_info[1] / gpu_info[0]
+        except ValueError:
+            gpu_fallback = _get_gpu_info_fallback(
+                self.instance_type, self.sagemaker_session.boto_region_name
+            )
+            logger.info("GPU fallback picked up %s", gpu_fallback)
+            single_gpu_size_mib = gpu_fallback[1] / gpu_fallback[0]
+
+        if single_gpu_size_mib is None:
+            logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
+            return False
+
+        total_size_mib = self.total_inference_model_size_mib()
+        logger.info(
+            "Total inference model size MIB %s, single GPU size for instance MIB %s",
+            total_size_mib,
+            single_gpu_size_mib,
+        )
+        return total_size_mib <= single_gpu_size_mib
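For reviewers unfamiliar with the accelerate estimator: `gather_data` returns one row per requested dtype, and index 2 of a row is the total size in bytes, per the inline comment above. A minimal standalone sketch of the same arithmetic, assuming network access to the Hugging Face Hub; the model id is illustrative:

```python
from accelerate.commands.estimate import estimate_command_parser, gather_data

MIB_CONVERSION_FACTOR = 1 / 2**20  # bytes -> MiB
MEMORY_BUFFER_MULTIPLIER = 1.2     # 20% inference-time buffer

# Same call pattern as total_inference_model_size_mib(); "gpt2" is illustrative.
parser = estimate_command_parser()
args = parser.parse_args(["gpt2", "--dtypes", "float32"])

# Row layout: "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam".
output = gather_data(args)
total_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
print(f"Estimated inference footprint: {total_mib:.0f} MiB")
```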
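Likewise, the division `gpu_info[1] / gpu_info[0]` implies the hardware detector returns a `(gpu_count, total_gpu_memory_mib)` pair. A minimal sketch of the fit check under that assumption, with hard-coded numbers standing in for the detector and for the accelerate estimate:

```python
# Assumed detector output, implied by the division in the diff:
# (gpu_count, total_gpu_memory_mib). Values are illustrative.
gpu_count, total_gpu_memory_mib = 4, 98304  # e.g. four 24 GiB GPUs

single_gpu_size_mib = total_gpu_memory_mib / gpu_count  # 24576.0 MiB per GPU
model_size_mib = 13000.0  # illustrative stand-in for total_inference_model_size_mib()

print(model_size_mib <= single_gpu_size_mib)  # True: ~12.7 GiB fits on one GPU
```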