Commit 42f9de8

Add option for not including estimator metadata in hyperparameter tuning job (#237)
Using an Amazon ML algorithm with the generic Estimator revealed that the class can't be used with an algorithm that won't accept extra (unrecognized) hyperparameters. Since that generic class was created primarily for use with the Amazon ML algorithms that we don't have custom estimators for, this change adds a new kwarg for not injecting the estimator class and module into a hyperparameter tuning job. This change also includes an integ test for the BYO estimator case.
1 parent 6ed1a77 commit 42f9de8
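To make the new behavior concrete, here is a minimal sketch of the opt-out from the caller's side (the image name, role, and S3 paths are placeholders; the tuner arguments mirror the integ test below):

    from sagemaker.estimator import Estimator
    from sagemaker.tuner import HyperparameterTuner, IntegerParameter

    # Generic Estimator wrapping an Amazon ML algorithm image that has no
    # custom estimator class in the SDK (placeholder image/role names).
    estimator = Estimator(image_name='<account>.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:1',
                          role='SageMakerRole', train_instance_count=1,
                          train_instance_type='ml.c4.xlarge')
    estimator.set_hyperparameters(num_factors=10, feature_dim=784,
                                  mini_batch_size=100, predictor_type='binary_classifier')

    tuner = HyperparameterTuner(estimator=estimator,
                                objective_metric_name='test:binary_classification_accuracy',
                                hyperparameter_ranges={'mini_batch_size': IntegerParameter(100, 200)},
                                max_jobs=2, max_parallel_jobs=2)

    # New kwarg: skip injecting sagemaker_estimator_class_name/_module, which
    # strict 1P algorithms would reject as unrecognized hyperparameters.
    tuner.fit({'train': 's3://my_bucket/train', 'test': 's3://my_bucket/test'},
              include_cls_metadata=False)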

File tree

  CHANGELOG.rst
  README.rst
  src/sagemaker/tuner.py
  tests/integ/test_tuner.py
  tests/unit/test_tuner.py

5 files changed: +116 -7 lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ CHANGELOG
 1.4.3dev
 ========
 * feature: Allow Local Serving of Models in S3
+* enhancement: Allow option for ``HyperparameterTuner`` to not include estimator metadata in job


 1.4.2

README.rst

Lines changed: 8 additions & 0 deletions
@@ -321,6 +321,14 @@ In addition, the ``fit()`` call uses a list of ``RecordSet`` objects instead of
     # Start hyperparameter tuning job
     my_tuner.fit([train_records, test_records])

+To aid with attaching a previously-started hyperparameter tuning job to a ``HyperparameterTuner`` instance, ``fit()`` injects metadata into the hyperparameters by default.
+If the algorithm you are using cannot handle unknown hyperparameters (e.g. an Amazon ML algorithm that does not have a custom estimator in the Python SDK), then you can set ``include_cls_metadata`` to ``False`` when calling ``fit()``:
+
+.. code:: python
+
+    my_tuner.fit({'train': 's3://my_bucket/my_training_data', 'test': 's3://my_bucket/my_testing_data'},
+                 include_cls_metadata=False)
+
 There is also an analytics object associated with each ``HyperparameterTuner`` instance that presents useful information about the hyperparameter tuning job.
 For example, the ``dataframe`` method gets a pandas dataframe summarizing the associated training jobs:

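Related context (not part of this diff): ``attach()`` is the consumer of this metadata, so opting out shifts responsibility to the caller at attach time. A hedged sketch, assuming the ``estimator_cls`` parameter of ``HyperparameterTuner.attach()`` (a dotted class-path string in this SDK version) and a placeholder job name:

    from sagemaker.tuner import HyperparameterTuner

    # 'my-byo-tuning-job' is a placeholder name. Since the job was started with
    # include_cls_metadata=False, its hyperparameters carry no
    # sagemaker_estimator_* entries, so we name the estimator class explicitly.
    tuner = HyperparameterTuner.attach('my-byo-tuning-job',
                                       estimator_cls='sagemaker.estimator.Estimator')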

src/sagemaker/tuner.py

Lines changed: 4 additions & 4 deletions
@@ -204,7 +204,7 @@ def __init__(self, estimator, objective_metric_name, hyperparameter_ranges, metr
         self._current_job_name = None
         self.latest_tuning_job = None

-    def _prepare_for_training(self, job_name=None):
+    def _prepare_for_training(self, job_name=None, include_cls_metadata=True):
         if job_name is not None:
             self._current_job_name = job_name
         else:
@@ -217,12 +217,12 @@ def _prepare_for_training(self, job_name=None):

         # For attach() to know what estimator to use for non-1P algorithms
         # (1P algorithms don't accept extra hyperparameters)
-        if not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
+        if include_cls_metadata and not isinstance(self.estimator, AmazonAlgorithmEstimatorBase):
             self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_CLASS_NAME] = json.dumps(
                 self.estimator.__class__.__name__)
             self.static_hyperparameters[self.SAGEMAKER_ESTIMATOR_MODULE] = json.dumps(self.estimator.__module__)

-    def fit(self, inputs, job_name=None, **kwargs):
+    def fit(self, inputs, job_name=None, include_cls_metadata=True, **kwargs):
         """Start a hyperparameter tuning job.

         Args:
@@ -253,7 +253,7 @@ def fit(self, inputs, job_name=None, **kwargs):
         else:
             self.estimator._prepare_for_training(job_name)

-        self._prepare_for_training(job_name=job_name)
+        self._prepare_for_training(job_name=job_name, include_cls_metadata=include_cls_metadata)
         self.latest_tuning_job = _TuningJob.start_new(self, inputs)

     @classmethod
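A toy reproduction of the guarded branch above (not SDK code) showing exactly which keys ``include_cls_metadata`` controls:

    import json

    class MyEstimator(object):
        """Stand-in for a generic estimator; only __class__ and __module__ are read."""

    def inject_metadata(estimator, static_hyperparameters, include_cls_metadata=True):
        # Mirrors HyperparameterTuner._prepare_for_training: only write the two
        # sagemaker_estimator_* keys when the caller hasn't opted out.
        if include_cls_metadata:
            static_hyperparameters['sagemaker_estimator_class_name'] = json.dumps(
                estimator.__class__.__name__)
            static_hyperparameters['sagemaker_estimator_module'] = json.dumps(estimator.__module__)

    hps = {}
    inject_metadata(MyEstimator(), hps)
    print(hps)  # {'sagemaker_estimator_class_name': '"MyEstimator"', 'sagemaker_estimator_module': '"__main__"'}

    hps = {}
    inject_metadata(MyEstimator(), hps, include_cls_metadata=False)
    print(hps)  # {}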

tests/integ/test_tuner.py

Lines changed: 88 additions & 3 deletions
@@ -13,19 +13,24 @@
 from __future__ import absolute_import

 import gzip
+import io
+import json
 import os
 import pickle
 import sys
 import time

+import boto3
 import numpy as np
 import pytest

-from sagemaker import LDA, RandomCutForest
-from sagemaker.amazon.common import read_records
-from sagemaker.amazon.kmeans import KMeans
+from sagemaker import KMeans, LDA, RandomCutForest
+from sagemaker.amazon.amazon_estimator import registry
+from sagemaker.amazon.common import read_records, write_numpy_to_dense_tensor
 from sagemaker.chainer import Chainer
+from sagemaker.estimator import Estimator
 from sagemaker.mxnet.estimator import MXNet
+from sagemaker.predictor import json_deserializer
 from sagemaker.tensorflow import TensorFlow
 from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
 from tests.integ import DATA_DIR
@@ -307,3 +312,83 @@ def test_tuning_chainer(sagemaker_session):
         data = np.zeros((batch_size, 28, 28), dtype='float32')
         output = predictor.predict(data)
         assert len(output) == batch_size
+
+
+@pytest.mark.continuous_testing
+def test_tuning_byo_estimator(sagemaker_session):
+    """Use the Factorization Machines algorithm as an example here.
+
+    First we need to prepare data for training. We take a standard data set, convert it to the
+    format that the algorithm can process, and upload it to S3.
+    Then we create the Estimator and set the hyperparameters as required by the algorithm.
+    Next, we can call fit() with the S3 path.
+    Later the trained model is deployed and prediction is called against the endpoint.
+    The default predictor is updated with a JSON serializer and deserializer.
+    """
+    image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'
+
+    with timeout(minutes=15):
+        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
+        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
+
+        with gzip.open(data_path, 'rb') as f:
+            train_set, _, _ = pickle.load(f, **pickle_args)
+
+        # take 100 examples for faster execution
+        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
+        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')
+
+        buf = io.BytesIO()
+        write_numpy_to_dense_tensor(buf, vectors, labels)
+        buf.seek(0)

+        bucket = sagemaker_session.default_bucket()
+        prefix = 'test_byo_estimator'
+        key = 'recordio-pb-data'
+        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
+        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
+
+        estimator = Estimator(image_name=image_name,
+                              role='SageMakerRole', train_instance_count=1,
+                              train_instance_type='ml.c4.xlarge',
+                              sagemaker_session=sagemaker_session, base_job_name='test-byo')
+
+        estimator.set_hyperparameters(num_factors=10,
+                                      feature_dim=784,
+                                      mini_batch_size=100,
+                                      predictor_type='binary_classifier')
+
+        hyperparameter_ranges = {'mini_batch_size': IntegerParameter(100, 200)}
+
+        tuner = HyperparameterTuner(estimator=estimator, base_tuning_job_name='byo',
+                                    objective_metric_name='test:binary_classification_accuracy',
+                                    hyperparameter_ranges=hyperparameter_ranges,
+                                    max_jobs=2, max_parallel_jobs=2)
+
+        tuner.fit({'train': s3_train_data, 'test': s3_train_data}, include_cls_metadata=False)
+
+        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)
+
+        time.sleep(15)
+        tuner.wait()
+
+    best_training_job = tuner.best_training_job()
+    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
+        predictor = tuner.deploy(1, 'ml.m4.xlarge', endpoint_name=best_training_job)
+        predictor.serializer = _fm_serializer
+        predictor.content_type = 'application/json'
+        predictor.deserializer = json_deserializer
+
+        result = predictor.predict(train_set[0][:10])
+
+        assert len(result['predictions']) == 10
+        for prediction in result['predictions']:
+            assert prediction['score'] is not None
+
+
+# Serializer for the Factorization Machines predictor (for BYO example)
+def _fm_serializer(data):
+    js = {'instances': []}
+    for row in data:
+        js['instances'].append({'features': row.tolist()})
+    return json.dumps(js)
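As a quick sanity check of the payload format (a sketch calling ``_fm_serializer`` as defined above on a small numpy array):

    import numpy as np

    print(_fm_serializer(np.array([[1.0, 2.0], [3.0, 4.0]])))
    # {"instances": [{"features": [1.0, 2.0]}, {"features": [3.0, 4.0]}]}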

tests/unit/test_tuner.py

Lines changed: 15 additions & 0 deletions
@@ -159,6 +159,21 @@ def test_prepare_for_training(tuner):
     assert tuner.static_hyperparameters['sagemaker_estimator_module'] == module


+def test_prepare_for_training_with_amazon_estimator(tuner, sagemaker_session):
+    tuner.estimator = PCA(ROLE, TRAIN_INSTANCE_COUNT, TRAIN_INSTANCE_TYPE, NUM_COMPONENTS,
+                          sagemaker_session=sagemaker_session)
+
+    tuner._prepare_for_training()
+    assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
+    assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters
+
+
+def test_prepare_for_training_dont_include_estimator_cls(tuner):
+    tuner._prepare_for_training(include_cls_metadata=False)
+    assert 'sagemaker_estimator_class_name' not in tuner.static_hyperparameters
+    assert 'sagemaker_estimator_module' not in tuner.static_hyperparameters
+
+
 def test_prepare_for_training_with_job_name(tuner):
     static_hyperparameters = {'validated': 1, 'another_one': 0}
     tuner.estimator.set_hyperparameters(**static_hyperparameters)
