change: fix integration test failures masked by timeout bug

knakad · knakad · commit e864ea35a1a0 · 2019-08-12T15:04:53.000-07:00
For background on the original issue, see aws#968
diff --git a/tests/integ/test_ipinsights.py b/tests/integ/test_ipinsights.py
@@ -26,10 +26,6 @@
 FEATURE_DIM = None
 
 
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_ipinsights(sagemaker_session):
     job_name = unique_name_from_base("ipinsights")
 
@@ -64,6 +60,5 @@ def test_ipinsights(sagemaker_session):
         predict_input = [["user_1", "1.1.1.1"]]
         result = predictor.predict(predict_input)
 
-        assert len(result) == 1
-        for record in result:
-            assert record.label["dot_product"] is not None
+        assert len(result["predictions"]) == 1
+        assert result["predictions"][0]["dot_product"] is not None
diff --git a/tests/integ/test_marketplace.py b/tests/integ/test_marketplace.py
@@ -35,7 +35,7 @@
 # Pre-Trained Model: Scikit Decision Trees - Pretrained Model
 # https://aws.amazon.com/marketplace/pp/prodview-7qop4x5ahrdhe
 #
-# Both are  written by Amazon and are free to subscribe.
+# Both are written by Amazon and are free to subscribe.
 
 ALGORITHM_ARN = (
     "arn:aws:sagemaker:%s:%s:algorithm/scikit-decision-trees-"
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
@@ -96,10 +96,6 @@ def test_deploy_model(mxnet_training_job, sagemaker_session, mxnet_full_version)
         assert "Could not find model" in str(exception.value)
 
 
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, mxnet_full_version):
     endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
 
@@ -123,18 +119,20 @@ def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, m
 
         model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name, tags=tags, kms_key=kms_key_arn)
 
-        returned_model = sagemaker_session.describe_model(EndpointName=model.name)
-        returned_model_tags = sagemaker_session.list_tags(ResourceArn=returned_model["ModelArn"])[
-            "Tags"
-        ]
+        returned_model = sagemaker_session.sagemaker_client.describe_model(ModelName=model.name)
+        returned_model_tags = sagemaker_session.sagemaker_client.list_tags(
+            ResourceArn=returned_model["ModelArn"]
+        )["Tags"]
 
-        endpoint = sagemaker_session.describe_endpoint(EndpointName=endpoint_name)
-        endpoint_tags = sagemaker_session.list_tags(ResourceArn=endpoint["EndpointArn"])["Tags"]
+        endpoint = sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
+        endpoint_tags = sagemaker_session.sagemaker_client.list_tags(
+            ResourceArn=endpoint["EndpointArn"]
+        )["Tags"]
 
-        endpoint_config = sagemaker_session.describe_endpoint_config(
+        endpoint_config = sagemaker_session.sagemaker_client.describe_endpoint_config(
             EndpointConfigName=endpoint["EndpointConfigName"]
         )
-        endpoint_config_tags = sagemaker_session.list_tags(
+        endpoint_config_tags = sagemaker_session.sagemaker_client.list_tags(
             ResourceArn=endpoint_config["EndpointConfigArn"]
         )["Tags"]
 
@@ -148,10 +146,6 @@ def test_deploy_model_with_tags_and_kms(mxnet_training_job, sagemaker_session, m
         assert endpoint_config["KmsKeyId"] == kms_key_arn
 
 
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_deploy_model_with_update_endpoint(
     mxnet_training_job, sagemaker_session, mxnet_full_version
 ):
@@ -172,26 +166,37 @@ def test_deploy_model_with_update_endpoint(
             framework_version=mxnet_full_version,
         )
         model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
-        old_endpoint = sagemaker_session.describe_endpoint(EndpointName=endpoint_name)
+        old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
+            EndpointName=endpoint_name
+        )
         old_config_name = old_endpoint["EndpointConfigName"]
 
         model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
-        new_endpoint = sagemaker_session.describe_endpoint(EndpointName=endpoint_name)[
-            "ProductionVariants"
-        ]
-        new_production_variants = new_endpoint["ProductionVariants"]
+
+        # Wait for endpoint to finish updating
+        max_retry_count = 40  # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
+        current_retry_count = 0
+        while current_retry_count <= max_retry_count:
+            if current_retry_count >= max_retry_count:
+                raise Exception("Endpoint status not 'InService' within expected timeout.")
+            time.sleep(30)
+            new_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
+                EndpointName=endpoint_name
+            )
+            current_retry_count += 1
+            if new_endpoint["EndpointStatus"] == "InService":
+                break
+
         new_config_name = new_endpoint["EndpointConfigName"]
+        new_config = sagemaker_session.sagemaker_client.describe_endpoint_config(
+            EndpointConfigName=new_config_name
+        )
 
         assert old_config_name != new_config_name
-        assert new_production_variants["InstanceType"] == "ml.m4.xlarge"
-        assert new_production_variants["InitialInstanceCount"] == 1
-        assert new_production_variants["AcceleratorType"] is None
+        assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
+        assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1
 
 
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_deploy_model_with_update_non_existing_endpoint(
     mxnet_training_job, sagemaker_session, mxnet_full_version
 ):
@@ -216,7 +221,7 @@ def test_deploy_model_with_update_non_existing_endpoint(
             framework_version=mxnet_full_version,
         )
         model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
-        sagemaker_session.describe_endpoint(EndpointName=endpoint_name)
+        sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
 
         with pytest.raises(ValueError, message=expected_error_message):
             model.deploy(
diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf_script_mode.py
@@ -127,10 +127,6 @@ def test_mnist_distributed(sagemaker_session, instance_type):
     )
 
 
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_mnist_async(sagemaker_session):
     estimator = TensorFlow(
         entry_point=SCRIPT,
@@ -168,9 +164,7 @@ def test_mnist_async(sagemaker_session):
         result = predictor.predict(np.zeros(784))
         print("predict result: {}".format(result))
         _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
-        _assert_model_tags_match(
-            sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS
-        )
+        _assert_model_tags_match(sagemaker_session.sagemaker_client, model_name, TAGS)
         _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
 
 
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
@@ -803,10 +803,6 @@ def test_tuning_chainer(sagemaker_session):
 
 
 @pytest.mark.canary_quick
-@pytest.mark.skip(
-    reason="This test has always failed, but the failure was masked by a bug. "
-    "This test should be fixed. Details in https://github.com/aws/sagemaker-python-sdk/pull/968"
-)
 def test_attach_tuning_pytorch(sagemaker_session):
     mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
     mnist_script = os.path.join(mnist_dir, "mnist.py")

Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`	`# Pre-Trained Model: Scikit Decision Trees - Pretrained Model`
`36`	`36`	`# https://aws.amazon.com/marketplace/pp/prodview-7qop4x5ahrdhe`
`37`	`37`	`#`
`38`		`-# Both are  written by Amazon and are free to subscribe.`
	`38`	`+# Both are written by Amazon and are free to subscribe.`
`39`	`39`
`40`	`40`	`ALGORITHM_ARN = (`
`41`	`41`	`"arn:aws:sagemaker:%s:%s:algorithm/scikit-decision-trees-"`