Skip to content

Commit 56836a1

Browse files
committed
Debugging issue
1 parent 9153c40 commit 56836a1

File tree

2 files changed

+18
-132
lines changed

2 files changed

+18
-132
lines changed

requirements/extras/test_requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,5 @@ onnx==1.14.1
4040
# tf2onnx==1.15.1
4141
nbformat>=5.9,<6
4242
accelerate>=0.24.1,<=0.27.0
43+
nvgpu
44+

tests/integ/sagemaker/serve/test_serve_pt_happy.py

Lines changed: 16 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@
2424
from sagemaker.serve.spec.inference_spec import InferenceSpec
2525
from torchvision.transforms import transforms
2626
from torchvision.models.squeezenet import squeezenet1_1
27-
27+
import tests.integ
2828
from tests.integ.sagemaker.serve.constants import (
2929
PYTORCH_SQUEEZENET_RESOURCE_DIR,
3030
SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
3131
PYTHON_VERSION_IS_NOT_310,
3232
)
3333
from tests.integ.timeout import timeout
34-
from tests.integ.utils import cleanup_model_resources
34+
from tests.integ.utils import cleanup_model_resources, gpu_list, retry_with_instance_list
3535
import logging
3636

3737
logger = logging.getLogger(__name__)
@@ -148,69 +148,39 @@ def model_builder(request):
148148
return request.getfixturevalue(request.param)
149149

150150

151-
# @pytest.mark.skipif(
152-
# PYTHON_VERSION_IS_NOT_310,
153-
# reason="The goal of these tests is to test the serving components of our feature",
154-
# )
155-
# @pytest.mark.parametrize(
156-
# "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
157-
# )
158-
# @pytest.mark.slow_test
159-
# @pytest.mark.flaky(reruns=5, reruns_delay=2)
160-
# def test_happy_pytorch_local_container(sagemaker_session, model_builder, test_image):
161-
# logger.info("Running in LOCAL_CONTAINER mode...")
162-
# caught_ex = None
163-
#
164-
# model = model_builder.build(mode=Mode.LOCAL_CONTAINER, sagemaker_session=sagemaker_session)
165-
#
166-
# with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT):
167-
# try:
168-
# logger.info("Deploying and predicting in LOCAL_CONTAINER mode...")
169-
# predictor = model.deploy()
170-
# logger.info("Local container successfully deployed.")
171-
# predictor.predict(test_image)
172-
# except Exception as e:
173-
# logger.exception("test failed")
174-
# caught_ex = e
175-
# finally:
176-
# if model.modes[str(Mode.LOCAL_CONTAINER)].container:
177-
# model.modes[str(Mode.LOCAL_CONTAINER)].container.kill()
178-
# if caught_ex:
179-
# assert (
180-
# False
181-
# ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"
182-
183-
184151
@pytest.mark.skipif(
185-
PYTHON_VERSION_IS_NOT_310, # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE,
186-
reason="The goal of these tests is to test the serving components of our feature",
152+
PYTHON_VERSION_IS_NOT_310,
153+
tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
154+
and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
155+
reason="no ml.p2 or ml.p3 instances in this region"
187156
)
157+
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
188158
@pytest.mark.parametrize(
189159
"model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
190160
)
191161
@pytest.mark.slow_test
192162
def test_happy_pytorch_sagemaker_endpoint(
193163
sagemaker_session,
194164
model_builder,
195-
cpu_instance_type,
196165
test_image,
166+
**kwargs
197167
):
198168
logger.info("Running in SAGEMAKER_ENDPOINT mode...")
199169
caught_ex = None
200-
201-
# iam_client = sagemaker_session.boto_session.client("iam")
202-
# role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
203-
204-
model = model_builder.build(mode=Mode.LOCAL_CONTAINER, sagemaker_session=sagemaker_session)
205-
170+
iam_client = sagemaker_session.boto_session.client("iam")
171+
role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
172+
model = model_builder.build(role_arn=role_arn, sagemaker_session=sagemaker_session)
206173
with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
207174
try:
208175
logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
209-
predictor = model.deploy()
176+
predictor = model.deploy(
177+
mode=Mode.SAGEMAKER_ENDPOINT,
178+
instance_type=kwargs["instance_type"],
179+
initial_instance_count=2,
180+
)
210181
logger.info("Endpoint successfully deployed.")
211182
predictor.predict(test_image)
212183
except Exception as e:
213-
logger.exception("test failed")
214184
caught_ex = e
215185
finally:
216186
cleanup_model_resources(
@@ -223,89 +193,3 @@ def test_happy_pytorch_sagemaker_endpoint(
223193
assert (
224194
False
225195
), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"
226-
227-
228-
# @pytest.mark.skipif(
229-
# PYTHON_VERSION_IS_NOT_310,
230-
# reason="The goal of these tests is to test the serving components of our feature",
231-
# )
232-
# @pytest.mark.parametrize(
233-
# "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
234-
# )
235-
# @pytest.mark.slow_test
236-
# def test_happy_pytorch_local_container_overwrite_to_sagemaker_endpoint(
237-
# sagemaker_session, model_builder, cpu_instance_type, test_image
238-
# ):
239-
# logger.info("Building model in LOCAL_CONTAINER mode...")
240-
# caught_ex = None
241-
#
242-
# iam_client = sagemaker_session.boto_session.client("iam")
243-
# role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
244-
# logger.debug("Role arn: %s", role_arn)
245-
#
246-
# model = model_builder.build(
247-
# mode=Mode.LOCAL_CONTAINER, role_arn=role_arn, sagemaker_session=sagemaker_session
248-
# )
249-
#
250-
# with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
251-
# try:
252-
# logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
253-
# predictor = model.deploy(
254-
# instance_type=cpu_instance_type,
255-
# initial_instance_count=1,
256-
# mode=Mode.SAGEMAKER_ENDPOINT,
257-
# )
258-
# logger.info("Endpoint successfully deployed.")
259-
# predictor.predict(test_image)
260-
# except Exception as e:
261-
# caught_ex = e
262-
# finally:
263-
# cleanup_model_resources(
264-
# sagemaker_session=model_builder.sagemaker_session,
265-
# model_name=model.name,
266-
# endpoint_name=model.endpoint_name,
267-
# )
268-
# if caught_ex:
269-
# logger.exception(caught_ex)
270-
# assert (
271-
# False
272-
# ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test"
273-
274-
275-
# @pytest.mark.skipif(
276-
# PYTHON_VERSION_IS_NOT_310,
277-
# reason="The goal of these tests is to test the serving components of our feature",
278-
# )
279-
# @pytest.mark.parametrize(
280-
# "model_builder", ["model_builder_inference_spec_schema_builder"], indirect=True
281-
# )
282-
# @pytest.mark.slow_test
283-
# def test_happy_pytorch_sagemaker_endpoint_overwrite_to_local_container(
284-
# sagemaker_session, model_builder, test_image
285-
# ):
286-
# logger.info("Building model in SAGEMAKER_ENDPOINT mode...")
287-
# caught_ex = None
288-
#
289-
# iam_client = sagemaker_session.boto_session.client("iam")
290-
# role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
291-
#
292-
# model = model_builder.build(
293-
# mode=Mode.SAGEMAKER_ENDPOINT, role_arn=role_arn, sagemaker_session=sagemaker_session
294-
# )
295-
#
296-
# with timeout(minutes=SERVE_LOCAL_CONTAINER_TIMEOUT):
297-
# try:
298-
# logger.info("Deploying and predicting in LOCAL_CONTAINER mode...")
299-
# predictor = model.deploy(mode=Mode.LOCAL_CONTAINER)
300-
# logger.info("Local container successfully deployed.")
301-
# predictor.predict(test_image)
302-
# except Exception as e:
303-
# logger.exception("test failed")
304-
# caught_ex = e
305-
# finally:
306-
# if model.modes[str(Mode.LOCAL_CONTAINER)].container:
307-
# model.modes[str(Mode.LOCAL_CONTAINER)].container.kill()
308-
# if caught_ex:
309-
# assert (
310-
# False
311-
# ), f"{caught_ex} was thrown when running pytorch squeezenet local container test"

0 commit comments

Comments
 (0)