Skip to content

Commit 2091718

Browse files
authored
fix: use unique names for test training jobs (#738)
1 parent 398f4e8 commit 2091718

12 files changed

+157 -144 lines changed

tests/integ/test_byo_estimator.py

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,7 @@ def test_byo_estimator(sagemaker_session, region):
5454
"""
5555
image_name = registry(region) + "/factorization-machines:1"
5656
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
57+
job_name = unique_name_from_base('byo')
5758

5859
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
5960
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
@@ -71,21 +72,19 @@ def test_byo_estimator(sagemaker_session, region):
7172
estimator = Estimator(image_name=image_name,
7273
role='SageMakerRole', train_instance_count=1,
7374
train_instance_type='ml.c4.xlarge',
74-
sagemaker_session=sagemaker_session, base_job_name='test-byo')
75+
sagemaker_session=sagemaker_session)
7576

7677
estimator.set_hyperparameters(num_factors=10,
7778
feature_dim=784,
7879
mini_batch_size=100,
7980
predictor_type='binary_classifier')
8081

8182
# training labels must be 'float32'
82-
estimator.fit({'train': s3_train_data})
83+
estimator.fit({'train': s3_train_data}, job_name=job_name)
8384

84-
endpoint_name = unique_name_from_base('byo')
85-
86-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
85+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
8786
model = estimator.create_model()
88-
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
87+
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=job_name)
8988
predictor.serializer = fm_serializer
9089
predictor.content_type = 'application/json'
9190
predictor.deserializer = sagemaker.predictor.json_deserializer
@@ -101,7 +100,7 @@ def test_async_byo_estimator(sagemaker_session, region):
101100
image_name = registry(region) + "/factorization-machines:1"
102101
endpoint_name = unique_name_from_base('byo')
103102
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
104-
training_job_name = ""
103+
job_name = unique_name_from_base('byo')
105104

106105
with timeout(minutes=5):
107106
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
@@ -119,19 +118,19 @@ def test_async_byo_estimator(sagemaker_session, region):
119118
estimator = Estimator(image_name=image_name,
120119
role='SageMakerRole', train_instance_count=1,
121120
train_instance_type='ml.c4.xlarge',
122-
sagemaker_session=sagemaker_session, base_job_name='test-byo')
121+
sagemaker_session=sagemaker_session)
123122

124123
estimator.set_hyperparameters(num_factors=10,
125124
feature_dim=784,
126125
mini_batch_size=100,
127126
predictor_type='binary_classifier')
128127

129128
# training labels must be 'float32'
130-
estimator.fit({'train': s3_train_data}, wait=False)
131-
training_job_name = estimator.latest_training_job.name
129+
estimator.fit({'train': s3_train_data}, wait=False, job_name=job_name)
132130

133131
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
134-
estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
132+
estimator = Estimator.attach(training_job_name=job_name,
133+
sagemaker_session=sagemaker_session)
135134
model = estimator.create_model()
136135
predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
137136
predictor.serializer = fm_serializer

tests/integ/test_factorization_machines.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525

2626

2727
def test_factorization_machines(sagemaker_session):
28+
job_name = unique_name_from_base('fm')
29+
2830
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
2931
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3032
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
@@ -37,15 +39,16 @@ def test_factorization_machines(sagemaker_session):
3739
train_instance_type='ml.c4.xlarge',
3840
num_factors=10, predictor_type='regressor',
3941
epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0 / 100,
40-
sagemaker_session=sagemaker_session, base_job_name='test-fm')
42+
sagemaker_session=sagemaker_session)
4143

4244
# training labels must be 'float32'
43-
fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')))
45+
fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')),
46+
job_name=job_name)
4447

45-
endpoint_name = unique_name_from_base('fm')
46-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
47-
model = FactorizationMachinesModel(fm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
48-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
48+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
49+
model = FactorizationMachinesModel(fm.model_data, role='SageMakerRole',
50+
sagemaker_session=sagemaker_session)
51+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
4952
result = predictor.predict(train_set[0][:10])
5053

5154
assert len(result) == 10
@@ -54,8 +57,7 @@ def test_factorization_machines(sagemaker_session):
5457

5558

5659
def test_async_factorization_machines(sagemaker_session):
57-
training_job_name = ""
58-
endpoint_name = unique_name_from_base('factorizationMachines')
60+
job_name = unique_name_from_base('fm')
5961

6062
with timeout(minutes=5):
6163
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
@@ -69,22 +71,23 @@ def test_async_factorization_machines(sagemaker_session):
6971
train_instance_type='ml.c4.xlarge',
7072
num_factors=10, predictor_type='regressor',
7173
epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0 / 100,
72-
sagemaker_session=sagemaker_session, base_job_name='test-fm')
74+
sagemaker_session=sagemaker_session)
7375

7476
# training labels must be 'float32'
75-
fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')), wait=False)
76-
training_job_name = fm.latest_training_job.name
77+
fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')),
78+
job_name=job_name,
79+
wait=False)
7780

7881
print("Detached from training job. Will re-attach in 20 seconds")
7982
time.sleep(20)
8083
print("attaching now...")
8184

82-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
83-
estimator = FactorizationMachines.attach(training_job_name=training_job_name,
85+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
86+
estimator = FactorizationMachines.attach(training_job_name=job_name,
8487
sagemaker_session=sagemaker_session)
8588
model = FactorizationMachinesModel(estimator.model_data, role='SageMakerRole',
8689
sagemaker_session=sagemaker_session)
87-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
90+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
8891
result = predictor.predict(train_set[0][:10])
8992

9093
assert len(result) == 10

tests/integ/test_horovod.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import boto3
2121
import pytest
2222

23+
import sagemaker.utils
2324
import tests.integ as integ
2425
from sagemaker.tensorflow import TensorFlow
2526
from tests.integ import timeout
@@ -30,7 +31,7 @@
3031
@pytest.mark.canary_quick
3132
@pytest.mark.parametrize('instance_type', ['ml.c5.xlarge', 'ml.p3.2xlarge'])
3233
def test_horovod(sagemaker_session, instance_type, tmpdir):
33-
34+
job_name = sagemaker.utils.unique_name_from_base('tf-horovod')
3435
estimator = TensorFlow(entry_point=os.path.join(horovod_dir, 'test_hvd_basic.py'),
3536
role='SageMakerRole',
3637
train_instance_count=2,
@@ -39,11 +40,10 @@ def test_horovod(sagemaker_session, instance_type, tmpdir):
3940
py_version=integ.PYTHON_VERSION,
4041
script_mode=True,
4142
framework_version='1.12',
42-
distributions={'mpi': {'enabled': True}},
43-
base_job_name='test-tf-horovod')
43+
distributions={'mpi': {'enabled': True}})
4444

4545
with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
46-
estimator.fit()
46+
estimator.fit(job_name=job_name)
4747

4848
tmp = str(tmpdir)
4949
extract_files_from_s3(estimator.model_data, tmp)
@@ -59,7 +59,7 @@ def test_horovod(sagemaker_session, instance_type, tmpdir):
5959
(2, 2)])
6060
def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir):
6161
output_path = 'file://%s' % tmpdir
62-
62+
job_name = sagemaker.utils.unique_name_from_base('tf-horovod')
6363
estimator = TensorFlow(entry_point=os.path.join(horovod_dir, 'test_hvd_basic.py'),
6464
role='SageMakerRole',
6565
train_instance_count=2,
@@ -70,11 +70,10 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
7070
output_path=output_path,
7171
framework_version='1.12',
7272
distributions={'mpi': {'enabled': True,
73-
'processes_per_host': processes}},
74-
base_job_name='test-tf-horovod')
73+
'processes_per_host': processes}})
7574

7675
with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
77-
estimator.fit()
76+
estimator.fit(job_name=job_name)
7877

7978
tmp = str(tmpdir)
8079
extract_files(output_path.replace('file://', ''), tmp)

tests/integ/test_ipinsights.py

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -25,30 +25,32 @@
2525

2626

2727
def test_ipinsights(sagemaker_session):
28+
job_name = unique_name_from_base('ipinsights')
29+
2830
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
2931
data_path = os.path.join(DATA_DIR, 'ipinsights')
3032
data_filename = 'train.csv'
3133

3234
with open(os.path.join(data_path, data_filename), 'rb') as f:
3335
num_records = len(f.readlines())
3436

35-
ipinsights = IPInsights(
36-
role='SageMakerRole',
37-
train_instance_count=1,
38-
train_instance_type='ml.c4.xlarge',
39-
num_entity_vectors=10,
40-
vector_dim=100,
41-
sagemaker_session=sagemaker_session,
42-
base_job_name='test-ipinsights')
37+
ipinsights = IPInsights(
38+
role='SageMakerRole',
39+
train_instance_count=1,
40+
train_instance_type='ml.c4.xlarge',
41+
num_entity_vectors=10,
42+
vector_dim=100,
43+
sagemaker_session=sagemaker_session)
4344

4445
record_set = prepare_record_set_from_local_files(data_path, ipinsights.data_location,
45-
num_records, FEATURE_DIM, sagemaker_session)
46-
ipinsights.fit(record_set, None)
47-
48-
endpoint_name = unique_name_from_base('ipinsights')
49-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
50-
model = IPInsightsModel(ipinsights.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
51-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
46+
num_records, FEATURE_DIM,
47+
sagemaker_session)
48+
ipinsights.fit(records=record_set, job_name=job_name)
49+
50+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
51+
model = IPInsightsModel(ipinsights.model_data, role='SageMakerRole',
52+
sagemaker_session=sagemaker_session)
53+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
5254
assert isinstance(predictor, RealTimePredictor)
5355

5456
predict_input = [['user_1', '1.1.1.1']]

tests/integ/test_kmeans.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727

2828

2929
def test_kmeans(sagemaker_session):
30+
job_name = unique_name_from_base('kmeans')
3031
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3132
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3233
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
@@ -37,7 +38,7 @@ def test_kmeans(sagemaker_session):
3738

3839
kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
3940
train_instance_type='ml.c4.xlarge',
40-
k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')
41+
k=10, sagemaker_session=sagemaker_session)
4142

4243
kmeans.init_method = 'random'
4344
kmeans.max_iterations = 1
@@ -61,12 +62,12 @@ def test_kmeans(sagemaker_session):
6162
force_dense='True',
6263
)
6364

64-
kmeans.fit(kmeans.record_set(train_set[0][:100]))
65+
kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)
6566

66-
endpoint_name = unique_name_from_base('kmeans')
67-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
68-
model = KMeansModel(kmeans.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
69-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
67+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
68+
model = KMeansModel(kmeans.model_data, role='SageMakerRole',
69+
sagemaker_session=sagemaker_session)
70+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
7071
result = predictor.predict(train_set[0][:10])
7172

7273
assert len(result) == 10
@@ -81,8 +82,7 @@ def test_kmeans(sagemaker_session):
8182

8283

8384
def test_async_kmeans(sagemaker_session):
84-
training_job_name = ""
85-
endpoint_name = unique_name_from_base('kmeans')
85+
job_name = unique_name_from_base('kmeans')
8686

8787
with timeout(minutes=5):
8888
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
@@ -94,7 +94,7 @@ def test_async_kmeans(sagemaker_session):
9494

9595
kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
9696
train_instance_type='ml.c4.xlarge',
97-
k=10, sagemaker_session=sagemaker_session, base_job_name='test-kmeans')
97+
k=10, sagemaker_session=sagemaker_session)
9898

9999
kmeans.init_method = 'random'
100100
kmeans.max_iterations = 1
@@ -118,17 +118,17 @@ def test_async_kmeans(sagemaker_session):
118118
force_dense='True',
119119
)
120120

121-
kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False)
122-
training_job_name = kmeans.latest_training_job.name
121+
kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)
123122

124123
print("Detached from training job. Will re-attach in 20 seconds")
125124
time.sleep(20)
126125
print("attaching now...")
127126

128-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
129-
estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
130-
model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
131-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
127+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
128+
estimator = KMeans.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
129+
model = KMeansModel(estimator.model_data, role='SageMakerRole',
130+
sagemaker_session=sagemaker_session)
131+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
132132
result = predictor.predict(train_set[0][:10])
133133

134134
assert len(result) == 10

tests/integ/test_knn.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@
2525

2626

2727
def test_knn_regressor(sagemaker_session):
28+
job_name = unique_name_from_base('knn')
29+
2830
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
2931
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3032
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
@@ -36,15 +38,15 @@ def test_knn_regressor(sagemaker_session):
3638
knn = KNN(role='SageMakerRole', train_instance_count=1,
3739
train_instance_type='ml.c4.xlarge',
3840
k=10, predictor_type='regressor', sample_size=500,
39-
sagemaker_session=sagemaker_session, base_job_name='test-knn-rr')
41+
sagemaker_session=sagemaker_session)
4042

4143
# training labels must be 'float32'
42-
knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')))
44+
knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')),
45+
job_name=job_name)
4346

44-
endpoint_name = unique_name_from_base('knn')
45-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
47+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
4648
model = KNNModel(knn.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
47-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
49+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
4850
result = predictor.predict(train_set[0][:10])
4951

5052
assert len(result) == 10
@@ -53,8 +55,7 @@ def test_knn_regressor(sagemaker_session):
5355

5456

5557
def test_async_knn_classifier(sagemaker_session):
56-
training_job_name = ""
57-
endpoint_name = unique_name_from_base('knn')
58+
job_name = unique_name_from_base('knn')
5859

5960
with timeout(minutes=5):
6061
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
@@ -68,22 +69,22 @@ def test_async_knn_classifier(sagemaker_session):
6869
train_instance_count=1, train_instance_type='ml.c4.xlarge',
6970
k=10, predictor_type='classifier', sample_size=500,
7071
index_type='faiss.IVFFlat', index_metric='L2',
71-
sagemaker_session=sagemaker_session, base_job_name='test-knn-cl')
72+
sagemaker_session=sagemaker_session)
7273

7374
# training labels must be 'float32'
74-
knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')), wait=False)
75-
training_job_name = knn.latest_training_job.name
75+
knn.fit(knn.record_set(train_set[0][:200], train_set[1][:200].astype('float32')),
76+
wait=False, job_name=job_name)
7677

7778
print("Detached from training job. Will re-attach in 20 seconds")
7879
time.sleep(20)
7980
print("attaching now...")
8081

81-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
82-
estimator = KNN.attach(training_job_name=training_job_name,
82+
with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
83+
estimator = KNN.attach(training_job_name=job_name,
8384
sagemaker_session=sagemaker_session)
8485
model = KNNModel(estimator.model_data, role='SageMakerRole',
8586
sagemaker_session=sagemaker_session)
86-
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
87+
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
8788
result = predictor.predict(train_set[0][:10])
8889

8990
assert len(result) == 10

0 commit comments

Comments
 (0)