20 | 20 |
21 | 21 | from pathlib import Path
22 | 22 |
   | 23 | +from accelerate.commands.estimate import estimate_command_parser, gather_data
23 | 24 | from sagemaker import Session
24 | 25 | from sagemaker.model import Model
25 | 26 | from sagemaker.base_predictor import PredictorBase
   | 27 | +from sagemaker.djl_inference.model import _determine_engine_for_model, FasterTransformerModel  # noqa: F401
26 | 28 | from sagemaker.serializers import NumpySerializer, TorchTensorSerializer
27 | 29 | from sagemaker.deserializers import JSONDeserializer, TorchTensorDeserializer
28 | 30 | from sagemaker.serve.builder.schema_builder import SchemaBuilder

39 | 41 | from sagemaker.serve.save_retrive.version_1_0_0.metadata.metadata import Metadata
40 | 42 | from sagemaker.serve.spec.inference_spec import InferenceSpec
41 | 43 | from sagemaker.serve.utils.predictors import _get_local_mode_predictor
   | 44 | +from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback
42 | 45 | from sagemaker.serve.detector.image_detector import (
43 | 46 |     auto_detect_container,
44 | 47 |     _detect_framework_and_version,

65 | 68 |     ModelServer.DJL_SERVING,
66 | 69 | }
67 | 70 |
   | 71 | +MIB_CONVERSION_FACTOR = 0.00000095367431640625
   | 72 | +MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% buffer
   | 73 | +
68 | 74 |
69 | 75 | # pylint: disable=attribute-defined-outside-init
70 | 76 | @dataclass
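A quick sanity check on the two new constants: `MIB_CONVERSION_FACTOR` is exactly 1/2^20 (bytes to MiB), and `MEMORY_BUFFER_MULTIPLIER` encodes the 20% inference-time overhead cited in the docstring added later in this diff. A minimal sketch of the arithmetic, using a hypothetical 7B-parameter float32 model (the model size is illustrative, not from this PR):

```python
MIB_CONVERSION_FACTOR = 0.00000095367431640625  # == 1 / 2**20, bytes -> MiB
MEMORY_BUFFER_MULTIPLIER = 1.2  # 20% buffer

# Hypothetical: 7B parameters in float32 (4 bytes per parameter).
raw_bytes = 7_000_000_000 * 4
buffered_mib = MEMORY_BUFFER_MULTIPLIER * raw_bytes * MIB_CONVERSION_FACTOR
print(f"{buffered_mib:,.0f} MiB")  # ~32,043 MiB, i.e. more than a 24 GiB GPU
```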
@@ -567,7 +573,7 @@ def wrapper(*args, **kwargs):
567 | 573 |     # It supports two modes of deployment
568 | 574 |     # 1/ SageMaker Endpoint
569 | 575 |     # 2/ Local launch with container
570 |     | -    def build(
    | 576 | +    def build(  # pylint: disable=R0911
571 | 577 |         self,
572 | 578 |         mode: Type[Mode] = None,
573 | 579 |         role_arn: str = None,
@@ -616,6 +622,10 @@ def build(
616 | 622 |             )
617 | 623 |         if hf_model_md.get("pipeline_tag") == "text-generation":  # pylint: disable=R1705
618 | 624 |             return self._build_for_tgi()
    | 625 | +        elif self._can_fit_on_single_gpu():
    | 626 | +            return self._build_for_transformers()
    | 627 | +        elif _determine_engine_for_model(self.model) == FasterTransformerModel:
    | 628 | +            return self._build_for_djl()
619 | 629 |         else:
620 | 630 |             return self._build_for_transformers()
621 | 631 |
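For context on the new branches: they only run when the Hugging Face metadata lookup succeeds and the `pipeline_tag` is not `text-generation`. A rough usage sketch, assuming the public `ModelBuilder`/`SchemaBuilder` API from `sagemaker.serve` (the model id and sample payloads are placeholders):

```python
from sagemaker.serve.builder.model_builder import ModelBuilder
from sagemaker.serve.builder.schema_builder import SchemaBuilder

# "google/flan-t5-base" is a placeholder whose pipeline_tag is
# "text2text-generation", not "text-generation", so build() skips the
# TGI branch and reaches the new single-GPU and FasterTransformer checks.
builder = ModelBuilder(
    model="google/flan-t5-base",
    schema_builder=SchemaBuilder(sample_input="hello", sample_output="bonjour"),
)
model = builder.build()
```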
@@ -672,3 +682,59 @@ def validate(self, model_dir: str) -> Type[bool]:
672 | 682 |         """
673 | 683 |
674 | 684 |         return get_metadata(model_dir)
    | 685 | +
    | 686 | +    def _total_inference_model_size_mib(self):
    | 687 | +        """Calculate the total model size in MiB via HF accelerate.
    | 688 | +
    | 689 | +        This function gets the model size from accelerate. It also adds a
    | 690 | +        memory buffer and converts the size to MiB. When performing
    | 691 | +        inference, expect to add up to an additional 20% to the given
    | 692 | +        model size, as found by EleutherAI.
    | 693 | +        """
    | 694 | +        dtypes = self.env_vars.get("dtypes", "float32")
    | 695 | +
    | 696 | +        try:
    | 697 | +            parser = estimate_command_parser()
    | 698 | +            args = parser.parse_args([self.model, "--dtypes", dtypes])
    | 699 | +        except ValueError:
    | 700 | +            logger.error("Args specified incorrectly for model %s", self.model)
    | 701 | +            raise  # without valid args, gather_data below cannot run
    | 702 | +
    | 703 | +        output = gather_data(
    | 704 | +            args
    | 705 | +        )  # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam"
    | 706 | +
    | 707 | +        total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR
    | 708 | +        logger.info("Total memory size MIB: %s", total_memory_size_mib)
    | 709 | +        return total_memory_size_mib
    | 710 | +
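`estimate_command_parser` and `gather_data` back accelerate's `accelerate estimate-memory` CLI. A standalone sketch of what the method computes (placeholder model id; fetching the model config requires Hugging Face Hub access):

```python
from accelerate.commands.estimate import estimate_command_parser, gather_data

parser = estimate_command_parser()
args = parser.parse_args(["openai-community/gpt2", "--dtypes", "float32"])

# Each output row: dtype, largest layer, total size in bytes, training size.
output = gather_data(args)
total_bytes = output[0][2]
print(1.2 * total_bytes / 2**20, "MiB, with the 20% buffer applied")
```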
    | 711 | +    def _can_fit_on_single_gpu(self) -> Type[bool]:
    | 712 | +        """Check whether the model can fit on a single GPU.
    | 713 | +
    | 714 | +        This function gets the GPU info, or falls back to a static lookup, to
    | 715 | +        find the memory size of a single GPU. Returns True if the model size is <= that size.
    | 716 | +        """
    | 717 | +        try:
    | 718 | +            gpu_info = _get_gpu_info(self.instance_type, self.sagemaker_session)
    | 719 | +            logger.info("GPU info %s for instance %s", gpu_info, self.instance_type)
    | 720 | +            single_gpu_size_mib = gpu_info[1] / gpu_info[0]
    | 721 | +        except ValueError:
    | 722 | +            try:  # consult the fallback only when the live lookup fails
    | 723 | +                gpu_fallback = _get_gpu_info_fallback(
    | 724 | +                    self.instance_type, self.sagemaker_session.boto_region_name
    | 725 | +                )
    | 726 | +                logger.info("GPU fallback picked up %s", gpu_fallback)
    | 727 | +                single_gpu_size_mib = gpu_fallback[1] / gpu_fallback[0]
    | 728 | +            except ValueError:
    | 729 | +                logger.info("Unable to determine single GPU size for instance %s", self.instance_type)
    | 730 | +                return False
    | 731 | +
    | 732 | +        total_size_mib = self._total_inference_model_size_mib()
    | 733 | +        if total_size_mib <= single_gpu_size_mib:
    | 734 | +            logger.info(
    | 735 | +                "Total inference model size MIB %s, single GPU size for instance MIB %s",
    | 736 | +                total_size_mib,
    | 737 | +                single_gpu_size_mib,
    | 738 | +            )
    | 739 | +            return True
    | 740 | +        return False
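Putting the two methods together, the fit check reduces to simple arithmetic. The `(gpu_count, total_gpu_memory_in_mib)` tuple layout is inferred from the `gpu_info[1] / gpu_info[0]` division above; the instance numbers are illustrative:

```python
# Hypothetical: a 4-GPU instance reporting 98304 MiB total (24 GiB per GPU).
gpu_info = (4, 98304)  # inferred layout: (gpu_count, total_gpu_memory_in_mib)
single_gpu_size_mib = gpu_info[1] / gpu_info[0]  # 24576.0

model_size_mib = 32043.0  # e.g. the buffered 7B float32 estimate from earlier
fits = model_size_mib <= single_gpu_size_mib
print(fits)  # False: build() falls through to the FasterTransformer/default branches
```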