aws
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 6 additions & 0 deletions
diff --git a/VERSION b/VERSION
Lines changed: 1 addition & 1 deletion
diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py
Lines changed: 4 additions & 1 deletion
diff --git a/src/sagemaker/huggingface/estimator.py b/src/sagemaker/huggingface/estimator.py
Lines changed: 85 additions & 28 deletions
diff --git a/src/sagemaker/image_uri_config/pytorch.json b/src/sagemaker/image_uri_config/pytorch.json
Lines changed: 115 additions & 3 deletions
@@ -1,5 +1,11 @@
 # Changelog
 
+## v2.147.0 (2023-04-18)
+
+### Features
+
+ * support different types of deletion mode
+
 ## v2.146.1 (2023-04-17)
 
 ### Bug Fixes and Other Changes
 
@@ -1 +1 @@
-2.146.2.dev0
+2.147.1.dev0
@@ -135,6 +135,7 @@
         "1.12.0",
         "1.12.1",
         "1.13.1",
+        "2.0.0",
     ],
 }
 
@@ -148,10 +149,11 @@
     "1.12.0",
     "1.12.1",
     "1.13.1",
+    "2.0.0",
 ]
 
 
-TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1"]
+TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1", "2.0.0"]
 
 TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
 TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS = [
@@ -161,6 +163,7 @@
     "1.12.0",
     "1.12.1",
     "1.13.1",
+    "2.0.0",
 ]
 
 SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
 
@@ -17,12 +17,10 @@
 import re
 from typing import Optional, Union, Dict
 
-from sagemaker.deprecations import renamed_kwargs
 from sagemaker.estimator import Framework, EstimatorBase
 from sagemaker.fw_utils import (
     framework_name_from_image,
-    warn_if_parameter_server_with_multi_gpu,
-    validate_smdistributed,
+    validate_distribution,
 )
 from sagemaker.huggingface.model import HuggingFaceModel
 from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
@@ -37,6 +35,9 @@ class HuggingFace(Framework):
     """Handle training of custom HuggingFace code."""
 
     _framework_name = "huggingface"
+    LAUNCH_PYTORCH_DDP_ENV_NAME = "sagemaker_pytorch_ddp_enabled"
+    LAUNCH_TORCH_DISTRIBUTED_ENV_NAME = "sagemaker_torch_distributed_enabled"
+    INSTANCE_TYPE_ENV_NAME = "sagemaker_instance_type"
 
     def __init__(
         self,
@@ -142,6 +143,36 @@ def __init__(
                         }
                     }
 
+                **To enable PyTorch DDP:**
+
+                    .. code:: python
+
+                        {
+                            "pytorchddp": {
+                                "enabled": True
+                            }
+                        }
+
+                    To learn more, see `Distributed PyTorch Training
+                    <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.
+
+                **To enable Torch Distributed:**
+
+                    This is available for general distributed training on
+                    GPU instances from PyTorch v1.13.1 and later.
+
+                    .. code:: python
+
+                        {
+                            "torch_distributed": {
+                                "enabled": True
+                            }
+                        }
+
+                    This option also supports distributed training on Trn1.
+                    To learn more, see `Distributed PyTorch Training on Trainium
+                    <https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.
+
                 To enable distributed training with
                 `SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
                 for Hugging Face Transformers with PyTorch:
@@ -182,29 +213,6 @@ def __init__(
 
         self._validate_args(image_uri=image_uri)
 
-        instance_type = renamed_kwargs(
-            "train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
-        )
-
-        base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
-        base_framework_version = (
-            tensorflow_version if tensorflow_version is not None else pytorch_version
-        )
-
-        if distribution is not None:
-            validate_smdistributed(
-                instance_type=instance_type,
-                framework_name=base_framework_name,
-                framework_version=base_framework_version,
-                py_version=self.py_version,
-                distribution=distribution,
-                image_uri=image_uri,
-            )
-
-            warn_if_parameter_server_with_multi_gpu(
-                training_instance_type=instance_type, distribution=distribution
-            )
-
         if "enable_sagemaker_metrics" not in kwargs:
             kwargs["enable_sagemaker_metrics"] = True
 
@@ -214,6 +222,25 @@ def __init__(
             entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs
         )
 
+        if "entry_point" not in kwargs:
+            kwargs["entry_point"] = entry_point
+
+        self.base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
+        self.base_framework_version = (
+            tensorflow_version if tensorflow_version is not None else pytorch_version
+        )
+
+        if distribution is not None:
+            distribution = validate_distribution(
+                distribution,
+                self.instance_groups,
+                self.base_framework_name,
+                self.base_framework_version,
+                py_version,
+                image_uri,
+                kwargs,
+            )
+
         self.distribution = distribution or {}
 
         if compiler_config is not None:
@@ -267,14 +294,44 @@ def _validate_args(self, image_uri):
                 "transformers_version, tensorflow_version and pytorch_version."
             )
 
+    def _huggingface_distribution_configuration(self, distribution):
+        """Returns a dict of distribution config for Hugging Face training.
+
+        Args:
+            distribution (dict): A dictionary with information on how to run distributed training.
+        Returns:
+            dict: PyTorch DDP or Torch Distributed settings if enabled, else the generic config.
+        """
+        distribution_config = {}
+        pytorch_ddp_enabled = False
+        torch_distributed_enabled = False
+
+        if "pytorchddp" in distribution:
+            pytorch_ddp_enabled = distribution.get("pytorchddp").get("enabled", False)
+        elif "torch_distributed" in distribution:
+            torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)
+
+        if pytorch_ddp_enabled:
+            distribution_config[self.LAUNCH_PYTORCH_DDP_ENV_NAME] = pytorch_ddp_enabled
+            if self.instance_type is not None:
+                distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
+        elif torch_distributed_enabled:
+            distribution_config[self.LAUNCH_TORCH_DISTRIBUTED_ENV_NAME] = torch_distributed_enabled
+            if self.instance_type is not None:
+                distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
+        else:
+            distribution_config = self._distribution_configuration(distribution=distribution)
+
+        return distribution_config
+
     def hyperparameters(self):
         """Return hyperparameters used by your custom PyTorch code during model training."""
         hyperparameters = super(HuggingFace, self).hyperparameters()
-        distributed_training_hyperparameters = self._distribution_configuration(
+        additional_hyperparameters = self._huggingface_distribution_configuration(
             distribution=self.distribution
         )
         hyperparameters.update(
-            EstimatorBase._json_encode_hyperparameters(distributed_training_hyperparameters)
+            EstimatorBase._json_encode_hyperparameters(additional_hyperparameters)
         )
 
         if self.compiler_config:
 
@@ -77,7 +77,8 @@
             "1.10": "1.10.2",
             "1.11": "1.11.0",
             "1.12": "1.12.1",
-            "1.13": "1.13.1"
+            "1.13": "1.13.1",
+            "2.0": "2.0.0"
         },
         "versions": {
             "0.4.0": {
@@ -838,6 +839,43 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-inference"
+            },
+            "2.0.0": {
+                "py_versions": [
+                    "py310"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-northeast-3": "364406365360",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ap-southeast-3": "907027046896",
+                    "ap-southeast-4": "457447274322",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-east-1": "446045086412",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-isob-east-1": "094389454867",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-inference"
             }
         }
     },
@@ -846,7 +884,8 @@
             "cpu"
         ],
         "version_aliases": {
-            "1.12": "1.12.1"
+            "1.12": "1.12.1",
+            "2.0": "2.0.0"
         },
         "versions": {
             "1.12.1": {
@@ -889,6 +928,41 @@
                 },
                 "repository": "pytorch-inference-graviton",
                 "container_version": {"cpu": "ubuntu20.04"}
+            },
+            "2.0.0": {
+                "py_versions": [
+                    "py310"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-northeast-3": "364406365360",
+                    "ap-south-1": "763104351884",
+                    "ap-south-2": "772153158452",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ap-southeast-3": "907027046896",
+                    "ap-southeast-4": "457447274322",
+                    "ca-central-1": "763104351884",
+                    "eu-central-1": "763104351884",
+                    "eu-central-2": "380420809688",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "eu-south-2": "503227376785",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-inference-graviton",
+                "container_version": {"cpu": "ubuntu20.04"}
             }
         }
     },
@@ -912,7 +986,8 @@
             "1.10": "1.10.2",
             "1.11": "1.11.0",
             "1.12": "1.12.1",
-            "1.13": "1.13.1"
+            "1.13": "1.13.1",
+            "2.0": "2.0.0"
         },
         "versions": {
             "0.4.0": {
@@ -1674,6 +1749,43 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-training"
+            },
+            "2.0.0": {
+                "py_versions": [
+                    "py310"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-northeast-3": "364406365360",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ap-southeast-3": "907027046896",
+                    "ap-southeast-4": "457447274322",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-east-1": "446045086412",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-isob-east-1": "094389454867",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-training"
             }
         }
     }