@@ -12,14 +12,17 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import

-import gzip
 import os
-import pickle
-import pytest
-import tests.integ

+import airflow
+import pytest
 import numpy as np
+from airflow import DAG
+from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
+from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
+from six.moves.urllib.parse import urlparse

+import tests.integ
 from sagemaker import (
     KMeans,
     FactorizationMachines,
@@ -39,21 +42,13 @@
 from sagemaker.pytorch.estimator import PyTorch
 from sagemaker.sklearn import SKLearn
 from sagemaker.tensorflow import TensorFlow
-from sagemaker.workflow import airflow as sm_airflow
 from sagemaker.utils import sagemaker_timestamp
-
-import airflow
-from airflow import DAG
-from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
-from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
-
+from sagemaker.workflow import airflow as sm_airflow
 from sagemaker.xgboost import XGBoost
-from tests.integ import DATA_DIR, PYTHON_VERSION
+from tests.integ import datasets, DATA_DIR, PYTHON_VERSION
 from tests.integ.record_set import prepare_record_set_from_local_files
 from tests.integ.timeout import timeout

-from six.moves.urllib.parse import urlparse
-
 PYTORCH_MNIST_DIR = os.path.join(DATA_DIR, "pytorch_mnist")
 PYTORCH_MNIST_SCRIPT = os.path.join(PYTORCH_MNIST_DIR, "mnist.py")
 AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS = 10
@@ -100,12 +95,6 @@ def test_byo_airflow_config_uploads_data_source_to_s3_when_inputs_provided(
 @pytest.mark.canary_quick
 def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         kmeans = KMeans(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -124,7 +113,7 @@ def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_
         kmeans.center_factor = 1
         kmeans.eval_metrics = ["ssd", "msd"]

-        records = kmeans.record_set(train_set[0][:100])
+        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])

         training_config = _build_airflow_workflow(
             estimator=kmeans, instance_type=cpu_instance_type, inputs=records
@@ -138,12 +127,6 @@ def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_

 def test_fm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         fm = FactorizationMachines(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -157,7 +140,8 @@ def test_fm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_inst
             sagemaker_session=sagemaker_session,
         )

-        records = fm.record_set(train_set[0][:200], train_set[1][:200].astype("float32"))
+        training_set = datasets.one_p_mnist()
+        records = fm.record_set(training_set[0][:200], training_set[1][:200].astype("float32"))

         training_config = _build_airflow_workflow(
             estimator=fm, instance_type=cpu_instance_type, inputs=records
@@ -203,12 +187,6 @@ def test_ipinsights_airflow_config_uploads_data_source_to_s3(sagemaker_session,

 def test_knn_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         knn = KNN(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -219,7 +197,8 @@ def test_knn_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
             sagemaker_session=sagemaker_session,
         )

-        records = knn.record_set(train_set[0][:200], train_set[1][:200].astype("float32"))
+        training_set = datasets.one_p_mnist()
+        records = knn.record_set(training_set[0][:200], training_set[1][:200].astype("float32"))

         training_config = _build_airflow_workflow(
             estimator=knn, instance_type=cpu_instance_type, inputs=records
@@ -273,15 +252,10 @@ def test_linearlearner_airflow_config_uploads_data_source_to_s3(
     sagemaker_session, cpu_instance_type
 ):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
-        train_set[1][:100] = 1
-        train_set[1][100:200] = 0
-        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
+        training_set = datasets.one_p_mnist()
+        training_set[1][:100] = 1
+        training_set[1][100:200] = 0
+        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))

         ll = LinearLearner(
             ROLE,
@@ -326,7 +300,7 @@ def test_linearlearner_airflow_config_uploads_data_source_to_s3(
         ll.early_stopping_tolerance = 0.0001
         ll.early_stopping_patience = 3

-        records = ll.record_set(train_set[0][:200], train_set[1][:200])
+        records = ll.record_set(training_set[0][:200], training_set[1][:200])

         training_config = _build_airflow_workflow(
             estimator=ll, instance_type=cpu_instance_type, inputs=records
@@ -375,12 +349,6 @@ def test_ntm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
 @pytest.mark.canary_quick
 def test_pca_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         pca = PCA(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -393,7 +361,7 @@ def test_pca_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
         pca.subtract_mean = True
         pca.extra_components = 5

-        records = pca.record_set(train_set[0][:100])
+        records = pca.record_set(datasets.one_p_mnist()[0][:100])

         training_config = _build_airflow_workflow(
             estimator=pca, instance_type=cpu_instance_type, inputs=records
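
For reference, every hunk above replaces the same inline gzip/pickle loading of one_p_mnist/mnist.pkl.gz with a call to the shared tests.integ.datasets.one_p_mnist() helper. The sketch below is a guess at what that helper presumably wraps, reconstructed only from the deleted lines; the real implementation in tests/integ/datasets.py is not part of this diff and may differ (for example, it may cache the arrays or return all three splits).

# Hypothetical sketch of tests/integ/datasets.py, inferred from the removed code.
import gzip
import os
import pickle

from tests.integ import DATA_DIR


def one_p_mnist():
    """Load the 1% MNIST sample and return the training split as numpy arrays."""
    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
    with gzip.open(data_path, "rb") as f:
        train_set, _, _ = pickle.load(f, encoding="latin1")
    return train_set

Factoring the loading into one helper keeps each test focused on estimator configuration and lets the tests share a single copy of the dataset-loading logic.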