14
14
15
15
import base64
16
16
import os
17
+ import requests
17
18
19
+ import botocore
18
20
import docker
19
21
import numpy
20
22
import pytest
30
32
from tests .integ .retry import retries
31
33
from tests .integ .timeout import timeout , timeout_and_delete_endpoint_by_name
32
34
33
- ALGORITHM_NAME = "sagemaker-multimodel-integ-test"
34
35
ROLE = "SageMakerRole"
35
36
PRETRAINED_MODEL_PATH_1 = "customer_a/dummy_model.tar.gz"
36
37
PRETRAINED_MODEL_PATH_2 = "customer_b/dummy_model.tar.gz"
@@ -47,27 +48,36 @@ def container_image(sagemaker_session):
47
48
"sts" , region_name = region , endpoint_url = utils .sts_regional_endpoint (region )
48
49
)
49
50
account_id = sts_client .get_caller_identity ()["Account" ]
51
+ algorithm_name = "sagemaker-multimodel-integ-test-{}" .format (sagemaker_timestamp ())
50
52
ecr_image = "{account}.dkr.ecr.{region}.amazonaws.com/{algorithm_name}:latest" .format (
51
- account = account_id , region = region , algorithm_name = ALGORITHM_NAME
53
+ account = account_id , region = region , algorithm_name = algorithm_name
52
54
)
53
55
54
56
# Build and tag docker image locally
55
57
docker_client = docker .from_env ()
56
58
image , build_log = docker_client .images .build (
57
- path = os .path .join (DATA_DIR , "multimodel" , "container" ), tag = ALGORITHM_NAME , rm = True
59
+ path = os .path .join (DATA_DIR , "multimodel" , "container" ), tag = algorithm_name , rm = True
58
60
)
59
61
image .tag (ecr_image , tag = "latest" )
60
62
61
63
# Create AWS ECR and push the local docker image to it
62
- _create_repository (ecr_client , ALGORITHM_NAME )
64
+ _create_repository (ecr_client , algorithm_name )
63
65
username , password = _ecr_login (ecr_client )
64
- docker_client .images .push (ecr_image , auth_config = {"username" : username , "password" : password })
66
+ # Retry docker image push
67
+ for _ in retries (3 , "Upload docker image to ECR repo" , seconds_to_sleep = 10 ):
68
+ try :
69
+ docker_client .images .push (
70
+ ecr_image , auth_config = {"username" : username , "password" : password }
71
+ )
72
+ break
73
+ except requests .exceptions .ConnectionError :
74
+ # This can happen when we try to create multiple repositories in parallel, so we retry
75
+ pass
76
+
65
77
yield ecr_image
66
78
67
79
# Delete repository after the multi model integration tests complete
68
- repo = ecr_client .describe_repositories (repositoryNames = [ALGORITHM_NAME ])
69
- if "repositories" in repo :
70
- ecr_client .delete_repository (repositoryName = ALGORITHM_NAME , force = True )
80
+ _delete_repository (ecr_client , algorithm_name )
71
81
72
82
73
83
def _create_repository (ecr_client , repository_name ):
@@ -87,6 +97,18 @@ def _create_repository(ecr_client, repository_name):
87
97
raise
88
98
89
99
100
def _delete_repository(ecr_client, repository_name):
    """Delete the ECR (Elastic Container Registry) repository created during setup.

    Intended to run once the integration test completes. A repository that
    no longer exists is treated as already cleaned up, so the call is safe
    to repeat.

    Args:
        ecr_client: boto3 ECR client used to issue the delete request.
        repository_name (str): Name of the repository to remove.
    """
    try:
        # No describe_repositories pre-check: delete_repository itself reports
        # a missing repository, so one request is enough.
        ecr_client.delete_repository(repositoryName=repository_name, force=True)
    except ecr_client.exceptions.RepositoryNotFoundException:
        # botocore generates service exception classes at runtime, so they are
        # only reachable through the client's ``exceptions`` attribute and not
        # as attributes of the ``botocore.errorfactory`` module.
        pass
110
+
111
+
90
112
def _ecr_login (ecr_client ):
91
113
""" Get a login credentials for an ecr client.
92
114
"""
0 commit comments