Reinvent 2024 early #4946

Merged: 79 commits, Dec 4, 2024
e267244
Base model trainer (#1521)
benieric Sep 30, 2024
9774f9e
feature: support script mode with local train.sh (#1523)
benieric Oct 3, 2024
fba3285
Image Spec refactoring and updates (#1525)
nargokul Oct 3, 2024
6a0224f
Add unit tests for ModelTrainer (#1527)
benieric Oct 3, 2024
7446b09
Add example notebook (#1528)
benieric Oct 7, 2024
cb7af78
Add enviornment variable bootstrapping script (#1530)
benieric Oct 8, 2024
80a1b89
feature: add utility function to capture local snapshot (#1524)
pintaoz-aws Oct 8, 2024
93a3c6d
Support intelligent parameters (#1540)
pintaoz-aws Oct 15, 2024
4fe8738
Revert Image Spec (#1541)
nargokul Oct 15, 2024
72e4266
Cleanup ModelTrainer (#1542)
benieric Oct 15, 2024
89edb6d
General image builder (#1546)
pintaoz-aws Oct 18, 2024
b40a499
Latest Container Image (#1545)
nargokul Oct 21, 2024
c3f432c
Cleanup ModelTrainer code (#1552)
benieric Oct 24, 2024
2e17bcb
feat: add pre-processing and post-processing logic to inference_spec …
pravali96 Nov 1, 2024
21a11a9
Add Distributed Training Support Model Trainer (#1536)
benieric Nov 4, 2024
a406f64
Add path to set Additional Settings in ModelTrainer (#1555)
benieric Nov 5, 2024
8cc19a3
Mask Sensitive Env Logs in Container (#1568)
benieric Nov 7, 2024
a8ed4ec
Fix bug in script mode setup ModelTrainer (#1575)
benieric Nov 8, 2024
ce55d45
Feature: ModelBuilder supports HuggingFace Models with benchmark data…
xiongz945 Nov 9, 2024
1ad75c9
Simplify Config Class Names and DistributedRunner structures (#1573)
benieric Nov 11, 2024
2aad9cd
Remove ignored files
benieric Nov 11, 2024
fa6ae28
Add in_process mode support for DJL and TorchServe servers (#1570)
pravali96 Nov 11, 2024
24b0dc0
Pass hyperparameters as CLI args (#1577)
benieric Nov 12, 2024
053808f
Trainer handshake (#1535)
nargokul Nov 12, 2024
debcdc2
Add Support for Training Recipes (#1565)
benieric Nov 12, 2024
8cf9631
Support building image from Dockerfile (#1571)
pintaoz-aws Nov 12, 2024
51fb427
Use exact python path in trainer template (#1584)
benieric Nov 14, 2024
70ae24f
Unified Deployment interface in Model Builder (#1549)
nargokul Nov 14, 2024
6b90f89
Add recipes examples (#1582)
benieric Nov 14, 2024
67f535d
update notebooks (#1588)
benieric Nov 14, 2024
e4701e8
update notebooks (#1592)
benieric Nov 14, 2024
5a37fc5
Single container local training (#1556)
pintaoz-aws Nov 14, 2024
5dae384
Bug fixes (#1596)
pintaoz-aws Nov 15, 2024
b29da8f
Update ModelTrainer Notebooks (#1597)
benieric Nov 15, 2024
2718402
add inference morpheus nbs (#1594)
gwang111 Nov 15, 2024
758a311
Fix: move the functionality from latest_container_image to retrieve (…
chad119 Nov 15, 2024
7ef6b99
Add bugbash bootstrapping (#1598)
benieric Nov 15, 2024
2cc4495
Fix: remove the special condition and fix the unit test (#1601)
chad119 Nov 15, 2024
df990c0
Notebooks update for Bugbash (#1595)
nargokul Nov 15, 2024
1a3330a
Add Rich Logging to Model Builder (#1604)
nargokul Nov 18, 2024
611ea9a
Fix: codestyles (#1606)
benieric Nov 19, 2024
fa02963
add modelID support to model builder InProcess model (#1580)
pravali96 Nov 19, 2024
a9dd628
Update kandinsky in ModelTrainer and allow setting requirements (#1587)
benieric Nov 19, 2024
5036fb8
[Updated] Add telemetry to ModelTrainer, Estimator and ModelBuilder (…
zhaoqizqwang Nov 20, 2024
1e17a1e
Integration tests for Model Builder Handshake (#1610)
nargokul Nov 20, 2024
aa2e62d
Use sagemaker core Session (#1607)
benieric Nov 20, 2024
2438a3f
Skip JS model mapping with env vars or image URI provided (#1599)
xiongz945 Nov 20, 2024
0e74aff
pin xgboost dlc to 1.7.1 to fix test (#1616)
gwang111 Nov 21, 2024
6985ec5
Revert image builder (#1614)
pintaoz-aws Nov 21, 2024
bd8e42a
add integ test for base_model_builder_deploy and remove print stateme…
chad119 Nov 21, 2024
d544e39
Fix tests and codestyle (#1619)
zhaoqizqwang Nov 21, 2024
0a488fe
Fix: Correctly serialize SM_HPS env var (#1611)
benieric Nov 21, 2024
784a18f
Intelligent defaults for Model Trainer (#1586)
nargokul Nov 22, 2024
9d1a418
add in-process mode definition to docs (#1622)
pravali96 Nov 22, 2024
a3e328e
Update ModelTrainer Interface Parameters (#1617)
benieric Nov 22, 2024
b50330f
Model Trainer Bucket improvements (#1618)
nargokul Nov 23, 2024
37f079c
Add interface units for ModelTrainer (#1631)
benieric Nov 26, 2024
cbba5eb
Update hyperpod recipe uris (#1629)
benieric Nov 26, 2024
4f31237
Integ tests for local mode model trainer (#1623)
pintaoz-aws Nov 27, 2024
720faab
Morpheus tests (#1633)
nargokul Nov 27, 2024
98d1d23
remove example notebooks artifacts (#1634)
benieric Nov 27, 2024
96db5c7
feat: Partner App Auth Provider for SDK support (#1548)
edwardps Oct 22, 2024
cff8216
change: fix the file uploading signature verification error (#1551)
edwardps Oct 24, 2024
45b89cc
Feature: Support GPU training recipes with Sagemaker Python SDK (#1516)
schinmayee Sep 19, 2024
13e10c9
Feature: Support Neuron training recipes. (#1526)
schinmayee Oct 4, 2024
1f34950
Feature: Resolve recipes correctly before launching (#1529)
schinmayee Oct 8, 2024
e68f4f5
Feature: Add unit tests for recipes and minor bug fixes. (#1532)
schinmayee Oct 11, 2024
2cc2caf
Feature: Move image uris and git repos for training recipes to json (…
schinmayee Oct 21, 2024
9480ee0
Update MANIFEST.in so that wheel builds correctly (#1563)
schinmayee Nov 2, 2024
30dfdca
Remove default values for fields in recipe_overrides and fix recipe p…
schinmayee Nov 5, 2024
c0e3958
Change default source directory to current, add option to specify sou…
schinmayee Nov 15, 2024
ce2376f
Changes for SMP v2.7.0 (#1609)
adtian2 Nov 22, 2024
74d6b7c
Update URIs to public for training recipes (#1621)
schinmayee Nov 22, 2024
fdf2e9a
Neuron URIs update (#1626)
schinmayee Nov 25, 2024
3bce287
Usage docs for training recipes (#1630)
schinmayee Nov 26, 2024
bd4a6cc
Add model trainer documentation (#1639)
benieric Dec 4, 2024
9a5b32f
Enable the Recipe tests marked with @pytest.mark.skip(reason="Hyperpo…
nargokul Dec 4, 2024
659244b
Add graphne to the doc requirements
pintaoz-aws Dec 4, 2024
ad3538b
Add graphene to doc requirements
pintaoz-aws Dec 4, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -32,6 +32,9 @@ env/
.python-version
*.html
**/_repack_script_launcher.sh
src/sagemaker/modules/train/container_drivers/sm_train.sh
src/sagemaker/modules/train/container_drivers/sourcecode.json
src/sagemaker/modules/train/container_drivers/distributed.json
tests/data/**/_repack_model.py
tests/data/experiment/sagemaker-dev-1.0.tar.gz
src/sagemaker/serve/tmp_workspace
1 change: 1 addition & 0 deletions .pydocstylerc
@@ -2,3 +2,4 @@
inherit = false
ignore = D104,D107,D202,D203,D213,D214,D400,D401,D404,D406,D407,D411,D413,D414,D415,D417
match = (?!record_pb2).*\.py
match-dir = (?!.*test).*
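The two negative-lookahead patterns above can be sanity-checked with Python's ``re`` module. A quick illustration (``fullmatch`` is used here to approximate how pydocstyle applies the patterns to file and directory names; the sample names are made up):

```python
import re

# match: check .py files, except the generated protobuf module
match_file = re.compile(r"(?!record_pb2).*\.py")
# match-dir: skip any directory whose name contains "test"
match_dir = re.compile(r"(?!.*test).*")

assert match_file.fullmatch("model_trainer.py") is not None
assert match_file.fullmatch("record_pb2.py") is None

assert match_dir.fullmatch("modules") is not None
assert match_dir.fullmatch("unit_tests") is None
```

So the new ``match-dir`` line excludes test directories from docstring checks while leaving source directories covered.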
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -1,8 +1,10 @@
recursive-include src/sagemaker *.py

include src/sagemaker/image_uri_config/*.json
include src/sagemaker/pytorch/training_recipes.json
include src/sagemaker/serve/schema/*.json
include src/sagemaker/serve/requirements.txt
include src/sagemaker/modules/train/sm_recipes/training_recipes.json
recursive-include requirements *

include VERSION
1 change: 1 addition & 0 deletions doc/api/training/index.rst
@@ -5,6 +5,7 @@ Training APIs
.. toctree::
:maxdepth: 4

model_trainer
algorithm
analytics
automl
17 changes: 17 additions & 0 deletions doc/api/training/model_trainer.rst
@@ -0,0 +1,17 @@
ModelTrainer
------------

.. autoclass:: sagemaker.modules.train.model_trainer.ModelTrainer
:members:

Configs
~~~~~~~

.. automodule:: sagemaker.modules.configs
:members:

Distributed
~~~~~~~~~~~

.. automodule:: sagemaker.modules.distributed
:members:
132 changes: 125 additions & 7 deletions doc/frameworks/pytorch/using_pytorch.rst
@@ -21,12 +21,9 @@ To train a PyTorch model by using the SageMaker Python SDK:
.. |create pytorch estimator| replace:: Create a ``sagemaker.pytorch.PyTorch`` Estimator
.. _create pytorch estimator: #create-an-estimator

.. |call fit| replace:: Call the estimator's ``fit`` method
.. _call fit: #call-the-fit-method

1. `Prepare a training script <#prepare-a-pytorch-training-script>`_
1. `Prepare a training script <#prepare-a-pytorch-training-script>`_ OR `Choose an Amazon SageMaker HyperPod recipe`_
2. |create pytorch estimator|_
3. |call fit|_
3. `Call the estimator's fit method or ModelTrainer's train method`_

Prepare a PyTorch Training Script
=================================
@@ -175,6 +172,16 @@ see `AWS Deep Learning Containers <https://github.com/aws/deep-learning-containe
- `Images for HuggingFace <https://github.com/aws/deep-learning-containers/tree/master/huggingface>`__


Choose an Amazon SageMaker HyperPod recipe
==========================================

Alternatively, instead of providing your own training script, you can choose an
`Amazon SageMaker HyperPod recipe <https://github.com/aws/sagemaker-hyperpod-recipes>`_ to launch training for a supported model.
With a recipe you only need to decide which recipe to run; you can also modify it as explained in the next section.



Create an Estimator
===================

@@ -196,10 +203,121 @@ directories ('train' and 'test').
'test': 's3://my-data-bucket/path/to/my/test/data'})


Amazon SageMaker HyperPod recipes
---------------------------------
Alternatively, if you are using Amazon SageMaker HyperPod recipes, follow these instructions:

Prerequisites: you need ``git`` installed on your client to access Amazon SageMaker HyperPod recipes code.

Call the fit Method
===================
When using a recipe, set the ``training_recipe`` arg in place of providing a training script.
This can be a recipe from the `sagemaker-hyperpod-recipes repository <https://github.com/aws/sagemaker-hyperpod-recipes>`_,
a local file, or a custom URL. Note that you must override the following using
``recipe_overrides``:

* directory paths in the recipe, so they point to the local container paths the Python SDK expects
* the output S3 URIs
* the Hugging Face access token
* any other recipe fields you wish to edit

The code snippet below shows an example.
Please refer to `SageMaker docs <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_
for more details about the expected local paths in the container and the Amazon SageMaker
HyperPod recipes tutorial for more examples.
You can override the fields by either setting ``recipe_overrides`` or
providing a modified ``training_recipe`` through a local file or a custom url.
When using the recipe, any provided ``entry_point`` will be ignored.

SageMaker will automatically set up the distribution args.
It will also determine the image to use for your model and device type,
but you can override this with the ``image_uri`` arg.

You can also override the number of nodes in the recipe with the ``instance_count`` arg to the estimator.
``source_dir`` will default to current working directory unless specified.
A local copy of training scripts and recipe will be saved in the ``source_dir``.
You can specify any additional packages you want to install for training in an optional ``requirements.txt`` in the ``source_dir``.

Note: for Llama 3.2 multi-modal models, you need to upgrade the ``transformers`` library by providing a ``requirements.txt`` in the ``source_dir`` that pins ``transformers==4.45.2``.
Please refer to the Amazon SageMaker HyperPod recipes documentation for more details.


Here is an example usage for the recipe ``hf_llama3_8b_seq8k_gpu_p5x16_pretrain``.


.. code:: python

recipe_overrides = {
"run": {
"results_dir": "/opt/ml/model",
},
"exp_manager": {
"exp_dir": "",
"explicit_log_dir": "/opt/ml/output/tensorboard",
"checkpoint_dir": "/opt/ml/checkpoints",
},
"model": {
"data": {
"train_dir": "/opt/ml/input/data/train",
"val_dir": "/opt/ml/input/data/val",
},
},
}
pytorch_estimator = PyTorch(
output_path=output_path,
base_job_name="llama-recipe",
role=role,
instance_type="ml.p5.48xlarge",
training_recipe="hf_llama3_8b_seq8k_gpu_p5x16_pretrain",
recipe_overrides=recipe_overrides,
sagemaker_session=sagemaker_session,
tensorboard_output_config=tensorboard_output_config,
)
pytorch_estimator.fit({'train': 's3://my-data-bucket/path/to/my/training/data',
'test': 's3://my-data-bucket/path/to/my/test/data'})

# Or alternatively with ModelTrainer
recipe_overrides = {
"run": {
"results_dir": "/opt/ml/model",
},
"exp_manager": {
"exp_dir": "",
"explicit_log_dir": "/opt/ml/output/tensorboard",
"checkpoint_dir": "/opt/ml/checkpoints",
},
"model": {
"data": {
"train_dir": "/opt/ml/input/data/train",
"val_dir": "/opt/ml/input/data/val",
},
},
}

model_trainer = ModelTrainer.from_recipe(
output_path=output_path,
base_job_name="llama-recipe",
training_recipe="training/llama/hf_llama3_8b_seq8k_gpu_p5x16_pretrain",
recipe_overrides=recipe_overrides,
compute=Compute(instance_type="ml.p5.48xlarge"),
sagemaker_session=sagemaker_session
).with_tensorboard_output_config(
tensorboard_output_config=tensorboard_output_config
)

train_input = Input(
channel_name="train",
data_source="s3://my-data-bucket/path/to/my/training/data"
)

test_input = Input(
channel_name="test",
data_source="s3://my-data-bucket/path/to/my/test/data"
)

model_trainer.train(input_data_config=[train_input, test_input])


Call the estimator's fit method or ModelTrainer's train method
==============================================================

You start your training script by calling ``fit`` on a ``PyTorch`` Estimator. ``fit`` takes both required and optional
arguments.
45 changes: 43 additions & 2 deletions doc/overview.rst
@@ -4,6 +4,7 @@ Using the SageMaker Python SDK

SageMaker Python SDK provides several high-level abstractions for working with Amazon SageMaker. These are:

- **ModelTrainer**: New interface encapsulating training on SageMaker.
- **Estimators**: Encapsulate training on SageMaker.
- **Models**: Encapsulate built ML models.
- **Predictors**: Provide real-time inference and transformation using Python data-types against a SageMaker endpoint.
@@ -24,8 +25,8 @@ Train a Model with the SageMaker Python SDK
To train a model by using the SageMaker Python SDK, you:

1. Prepare a training script
2. Create an estimator
3. Call the ``fit`` method of the estimator
2. Create a ModelTrainer or Estimator
3. Call the ``train`` method of the ModelTrainer or the ``fit`` method of the Estimator

After you train a model, you can save it, and then serve the model as an endpoint to get real-time inferences or get inferences for an entire dataset by using batch transform.

@@ -85,6 +86,46 @@ If you want to use, for example, boolean hyperparameters, you need to specify ``
For more on training environment variables, please visit `SageMaker Containers <https://github.com/aws/sagemaker-containers>`_.


Using ModelTrainer
==================

To use the ModelTrainer class, you provide a few essential parameters, such as the training image URI and the source code configuration, and ModelTrainer spins up the SageMaker training job for you.

For more information about the class definitions, see `ModelTrainer <https://sagemaker.readthedocs.io/en/stable/api/training/model_trainer.html>`_.

Example: Launching a Training Job with a Custom Script

.. code:: python

from sagemaker.modules.train import ModelTrainer
from sagemaker.modules.configs import SourceCode, InputData

# Image URI for the training job
pytorch_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0.0-cpu-py310"

# Define the script to be run
source_code = SourceCode(
source_dir="basic-script-mode",
requirements="requirements.txt",
entry_script="custom_script.py",
)

# Define the ModelTrainer
model_trainer = ModelTrainer(
training_image=pytorch_image,
source_code=source_code,
base_job_name="script-mode",
)

# Pass the input data
input_data = InputData(
channel_name="train",
data_source=training_input_path, # S3 path where training data is stored
)

# Start the training job
model_trainer.train(input_data_config=[input_data], wait=False)

Using Estimators
================

1 change: 1 addition & 0 deletions doc/requirements.txt
@@ -5,3 +5,4 @@ packaging==20.9
jinja2==3.1.4
schema==0.7.5
accelerate>=0.24.1,<=0.27.0
graphene<4.0
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -35,10 +35,12 @@ dependencies = [
"boto3>=1.34.142,<2.0",
"cloudpickle==2.2.1",
"docker",
"fastapi",
"google-pasta",
"importlib-metadata>=1.4.0,<7.0",
"jsonschema",
"numpy>=1.9.0,<2.0",
"omegaconf>=2.2,<2.3",
"packaging>=20.0",
"pandas",
"pathos",
@@ -53,6 +55,7 @@ dependencies = [
"tblib>=1.7.0,<4",
"tqdm",
"urllib3>=1.26.8,<3.0.0",
"uvicorn"
]

[project.scripts]
1 change: 1 addition & 0 deletions requirements/extras/test_requirements.txt
@@ -49,3 +49,4 @@ uvicorn>=0.30.1
fastapi==0.115.4
nest-asyncio
sagemaker-mlflow>=0.1.0
deepdiff>=8.0.0
1 change: 1 addition & 0 deletions src/sagemaker/__init__.py
@@ -74,5 +74,6 @@
)

from sagemaker.debugger import ProfilerConfig, Profiler # noqa: F401
from sagemaker.partner_app.auth_provider import PartnerAppAuthProvider # noqa: F401

__version__ = importlib_metadata.version("sagemaker")
Empty file.
@@ -0,0 +1,27 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Config Classes for taking in parameters for Batch Inference"""

from __future__ import absolute_import
from pydantic import BaseModel


class BatchTransformInferenceConfig(BaseModel):
"""Config class for Batch Transform Inference

* Can be used to deploy from ModelBuilder
"""

instance_count: int
instance_type: str
output_path: str
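As a quick illustration, a config like this can be constructed and validated through pydantic. This is a sketch: the class is re-declared here so the snippet is self-contained, and the field values are invented.

```python
from pydantic import BaseModel, ValidationError


class BatchTransformInferenceConfig(BaseModel):
    """Re-declaration of the config class above, for illustration only."""

    instance_count: int
    instance_type: str
    output_path: str


config = BatchTransformInferenceConfig(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path="s3://my-bucket/batch-output",
)

# pydantic enforces the declared field types at construction time
try:
    BatchTransformInferenceConfig(
        instance_count="two",  # not coercible to int
        instance_type="ml.m5.xlarge",
        output_path="s3://my-bucket/batch-output",
    )
except ValidationError:
    print("invalid instance_count rejected")
```

Because the config is a plain pydantic model, callers get type errors up front rather than at job-submission time.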
22 changes: 22 additions & 0 deletions src/sagemaker/config/config_schema.py
@@ -116,6 +116,7 @@
REGION_NAME = "region_name"
TELEMETRY_OPT_OUT = "TelemetryOptOut"
NOTEBOOK_JOB = "NotebookJob"
MODEL_TRAINER = "ModelTrainer"


def _simple_path(*args: str):
@@ -142,6 +143,7 @@ def _simple_path(*args: str):
)
TRAINING_JOB_ROLE_ARN_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, ROLE_ARN)
TRAINING_JOB_VPC_CONFIG_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, VPC_CONFIG)
TRAINING_JOB_TAGS_PATH = _simple_path(SAGEMAKER, TRAINING_JOB, TAGS)
TRAINING_JOB_SECURITY_GROUP_IDS_PATH = _simple_path(
TRAINING_JOB_VPC_CONFIG_PATH, SECURITY_GROUP_IDS
)
@@ -656,6 +658,25 @@ def _simple_path(*args: str):
"minItems": 1,
"maxItems": 15,
},
"role": {
TYPE: "string",
"pattern": r"^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$",
"minLength": 20,
"maxLength": 2048,
},
"baseJobName": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"sourceCode": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"distributed": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"compute": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"networking": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"stoppingCondition": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"trainingImage": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"trainingImageConfig": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"algorithmName": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"outputDataConfig": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"trainingInputMode": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"environment": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
"hyperparameters": {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
},
PROPERTIES: {
SCHEMA_VERSION: {
@@ -709,6 +730,7 @@ def _simple_path(*args: str):
},
},
},
MODEL_TRAINER: {TYPE: OBJECT, ADDITIONAL_PROPERTIES: True},
ESTIMATOR: {
TYPE: OBJECT,
ADDITIONAL_PROPERTIES: False,
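The ``role`` pattern added to the schema can be exercised directly. A quick check of the ARN regex from the diff above (the example ARNs are invented):

```python
import re

# Pattern copied from the "role" property added to the config schema
ROLE_PATTERN = re.compile(
    r"^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$"
)

# Standard partition, and a non-standard partition suffix like aws-cn
assert ROLE_PATTERN.match("arn:aws:iam::123456789012:role/MySageMakerRole")
assert ROLE_PATTERN.match("arn:aws-cn:iam::123456789012:role/service-role/Custom")

# Account IDs must be exactly 12 digits
assert not ROLE_PATTERN.match("arn:aws:iam::1234:role/TooFewAccountDigits")
```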