|
13 | 13 | from __future__ import absolute_import
|
14 | 14 |
|
15 | 15 | import gzip
|
| 16 | +import io |
| 17 | +import json |
16 | 18 | import os
|
17 | 19 | import pickle
|
18 | 20 | import sys
|
19 | 21 | import time
|
20 | 22 |
|
| 23 | +import boto3 |
21 | 24 | import numpy as np
|
22 | 25 | import pytest
|
23 | 26 |
|
24 |
| -from sagemaker import LDA, RandomCutForest |
25 |
| -from sagemaker.amazon.common import read_records |
26 |
| -from sagemaker.amazon.kmeans import KMeans |
| 27 | +from sagemaker import KMeans, LDA, RandomCutForest |
| 28 | +from sagemaker.amazon.amazon_estimator import registry |
| 29 | +from sagemaker.amazon.common import read_records, write_numpy_to_dense_tensor |
27 | 30 | from sagemaker.chainer import Chainer
|
| 31 | +from sagemaker.estimator import Estimator |
28 | 32 | from sagemaker.mxnet.estimator import MXNet
|
| 33 | +from sagemaker.predictor import json_deserializer |
29 | 34 | from sagemaker.tensorflow import TensorFlow
|
30 | 35 | from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
|
31 | 36 | from tests.integ import DATA_DIR
|
@@ -307,3 +312,83 @@ def test_tuning_chainer(sagemaker_session):
|
307 | 312 | data = np.zeros((batch_size, 28, 28), dtype='float32')
|
308 | 313 | output = predictor.predict(data)
|
309 | 314 | assert len(output) == batch_size
|
| 315 | + |
| 316 | + |
@pytest.mark.continuous_testing
def test_tuning_byo_estimator(sagemaker_session):
    """Run a hyperparameter tuning job on a bring-your-own-image Estimator.

    The Factorization Machines algorithm image is used as the example here.

    First we need to prepare data for training. We take a standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparameters as required by the algorithm.
    Next, we create a HyperparameterTuner around the Estimator and call fit() with the S3 path.
    Later the model of the best training job is deployed and prediction is called against
    the endpoint.
    The default predictor is updated with a JSON serializer and deserializer.

    Args:
        sagemaker_session: SageMaker session used for the region lookup, the default bucket,
            the S3 upload, the training/tuning jobs and the endpoint cleanup.
    """
    image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # The fixture was pickled under Python 2; Python 3 needs an explicit encoding to load it.
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        # Binary target: 1.0 where the original label equals 0, otherwise 0.0.
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)  # rewind so the upload reads from the start of the buffer

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        # S3 keys always use '/' as the separator; os.path.join would produce '\\' on Windows
        # and the uploaded object would no longer match the s3:// URI handed to the tuner.
        s3_key = '{}/train/{}'.format(prefix, key)
        # Upload through the session under test so the credentials and region match the
        # bucket returned by default_bucket() and the jobs started below.
        sagemaker_session.boto_session.resource('s3').Bucket(bucket).Object(s3_key).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}'.format(bucket, s3_key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}

        tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
                                    objective_metric_name='test:binary_classification_accuracy',
                                    hyperparameter_ranges=hyperparameter_ranges,
                                    max_jobs=2, max_parallel_jobs=2)

        # The same data set is reused for the 'test' channel that the objective metric reads.
        tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    # The endpoint is named after the best training job so the context manager can delete it.
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
        predictor.serializer = _fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
| 388 | + |
# Request serializer for the Factorization Machines predictor (for BYO example)
def _fm_serializer(data):
    """Return the JSON request body for a batch of feature rows.

    Every element of ``data`` must provide ``tolist()``; each row becomes one
    ``{'features': [...]}`` entry under the top-level ``'instances'`` key.
    """
    instances = [{'features': row.tolist()} for row in data]
    return json.dumps({'instances': instances})
0 commit comments