fix: Handle instance support for Hugging Face tests (#3729)

tejaschumbalkar · NivekNey · andre-marcos-perez · web-flow · commit 8d282c1a899a · 2023-03-20T17:15:03.000-07:00
Co-authored-by: Kevin <yenkevin1203@gmail.com>
Co-authored-by: André Perez <andre.marcos.perez@gmail.com>
Co-authored-by: Mufaddal Rohawala <89424143+mufaddal-rohawala@users.noreply.github.com>
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -567,11 +567,16 @@ def gpu_instance_type(sagemaker_session, request):
 
 @pytest.fixture()
 def gpu_pytorch_instance_type(sagemaker_session, request):
-    if "pytorch_inference_version" in request.fixturenames:
-        fw_version = request.getfixturevalue("pytorch_inference_version")
-    else:
+    fw_version = None
+    for pytorch_version_fixture in [
+        "pytorch_inference_version",
+        "huggingface_training_pytorch_latest_version",
+        "huggingface_inference_pytorch_latest_version",
+    ]:
+        if pytorch_version_fixture in request.fixturenames:
+            fw_version = request.getfixturevalue(pytorch_version_fixture)
+    if fw_version is None:
         fw_version = request.param
-
     region = sagemaker_session.boto_session.region_name
     if region in NO_P3_REGIONS:
         if Version(fw_version) >= Version("1.13"):
diff --git a/tests/integ/test_huggingface.py b/tests/integ/test_huggingface.py
@@ -19,27 +19,19 @@
 from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
 from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor
 from sagemaker.utils import unique_name_from_base
-from tests import integ
-from tests.integ.utils import gpu_list, retry_with_instance_list
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 ROLE = "SageMakerRole"
 
 
 @pytest.mark.release
-@pytest.mark.skipif(
-    integ.test_region() in integ.TRAINING_NO_P2_REGIONS
-    and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
-    reason="no ml.p2 or ml.p3 instances in this region",
-)
-@retry_with_instance_list(gpu_list(integ.test_region()))
 def test_framework_processing_job_with_deps(
     sagemaker_session,
     huggingface_training_latest_version,
     huggingface_training_pytorch_latest_version,
     huggingface_pytorch_latest_training_py_version,
-    **kwargs,
+    gpu_pytorch_instance_type,
 ):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
@@ -51,7 +43,7 @@ def test_framework_processing_job_with_deps(
             py_version=huggingface_pytorch_latest_training_py_version,
             role=ROLE,
             instance_count=1,
-            instance_type=kwargs["instance_type"],
+            instance_type=gpu_pytorch_instance_type,
             sagemaker_session=sagemaker_session,
             base_job_name="test-huggingface",
         )
@@ -64,18 +56,12 @@ def test_framework_processing_job_with_deps(
 
 
 @pytest.mark.release
-@pytest.mark.skipif(
-    integ.test_region() in integ.TRAINING_NO_P2_REGIONS
-    and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
-    reason="no ml.p2 or ml.p3 instances in this region",
-)
-@retry_with_instance_list(gpu_list(integ.test_region()))
 def test_huggingface_training(
     sagemaker_session,
     huggingface_training_latest_version,
     huggingface_training_pytorch_latest_version,
     huggingface_pytorch_latest_training_py_version,
-    **kwargs,
+    gpu_pytorch_instance_type,
 ):
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "huggingface")
@@ -87,7 +73,7 @@ def test_huggingface_training(
             transformers_version=huggingface_training_latest_version,
             pytorch_version=huggingface_training_pytorch_latest_version,
             instance_count=1,
-            instance_type=kwargs["instance_type"],
+            instance_type=gpu_pytorch_instance_type,
             hyperparameters={
                 "model_name_or_path": "distilbert-base-cased",
                 "task_name": "wnli",
@@ -111,14 +97,6 @@ def test_huggingface_training(
 
 
 @pytest.mark.release
-@pytest.mark.skipif(
-    integ.test_region() in integ.TRAINING_NO_P2_REGIONS
-    and integ.test_region() in integ.TRAINING_NO_P3_REGIONS,
-    reason="no ml.p2 or ml.p3 instances in this region",
-)
-@pytest.mark.skip(
-    reason="need to re enable it later t.corp:V609860141",
-)
 def test_huggingface_training_tf(
     sagemaker_session,
     gpu_instance_type,
@@ -161,7 +139,7 @@ def test_huggingface_training_tf(
 )
 def test_huggingface_inference(
     sagemaker_session,
-    gpu_instance_type,
+    gpu_pytorch_instance_type,
     huggingface_inference_latest_version,
     huggingface_inference_pytorch_latest_version,
     huggingface_pytorch_latest_inference_py_version,
@@ -182,7 +160,9 @@ def test_huggingface_inference(
     )
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
         model.deploy(
-            instance_type=gpu_instance_type, initial_instance_count=1, endpoint_name=endpoint_name
+            instance_type=gpu_pytorch_instance_type,
+            initial_instance_count=1,
+            endpoint_name=endpoint_name,
         )
 
         predictor = HuggingFacePredictor(endpoint_name=endpoint_name)