add HF test

philschmid · philschmid · commit f72c632edd77 · 2023-03-29T14:55:27.000+02:00
diff --git a/tests/integ/test_huggingface_torch_distributed.py b/tests/integ/test_huggingface_torch_distributed.py
@@ -0,0 +1,57 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+import pytest
+import sagemaker.utils
+from sagemaker.huggingface import HuggingFace  # not exported by sagemaker.pytorch
+from tests.integ import timeout
+from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
+
+
+@pytest.mark.skip(
+    reason="Disabling until the launch of SM Trainium containers. This test should be re-enabled later."
+)  # NOTE(review): reason cites Trainium, but the job requests a g5 instance - confirm.
+def test_torch_distributed_trn1_pt_mnist(
+    sagemaker_session,
+    huggingface_training_latest_version,
+    huggingface_training_pytorch_latest_version,
+    huggingface_pytorch_latest_training_py_version,
+):
+    """Fit a HuggingFace estimator on run_glue.py (wnli) with torch_distributed enabled."""
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        estimator = HuggingFace(
+            py_version=huggingface_pytorch_latest_training_py_version,
+            entry_point=os.path.join(DATA_DIR, "huggingface", "run_glue.py"),
+            role="SageMakerRole",
+            transformers_version=huggingface_training_latest_version,
+            pytorch_version=huggingface_training_pytorch_latest_version,
+            instance_count=1,
+            instance_type="ml.g5.12xlarge",  # SageMaker instance types need the "ml." prefix
+            hyperparameters={
+                "model_name_or_path": "distilbert-base-cased",
+                "task_name": "wnli",
+                "do_train": True,
+                "do_eval": True,
+                "max_seq_length": 128,
+                "fp16": True,
+                "per_device_train_batch_size": 32,
+                "output_dir": "/opt/ml/model",
+            },
+            distribution={"torch_distributed": {"enabled": True}},
+            sagemaker_session=sagemaker_session,
+            disable_profiler=True,
+        )
+        estimator.fit()