aws · yangaws · Mar 23, 2018 · Mar 20, 2018 · Mar 20, 2018 · Mar 22, 2018
@@ -6,6 +6,7 @@ CHANGELOG
 ========
 
 * feature: Tests: create configurable ``sagemaker_session`` pytest fixture for all integration tests
+* bug-fix: AmazonEstimators: fix inaccurate hyper-parameters in kmeans, pca and linear learner
 
 1.1.2
 =====

@@ -13,7 +13,7 @@
 from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
 from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
 from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
-from sagemaker.amazon.validation import gt, isin, ge
+from sagemaker.amazon.validation import gt, isin, ge, le
 from sagemaker.predictor import RealTimePredictor
 from sagemaker.model import Model
 from sagemaker.session import Session
@@ -27,16 +27,18 @@ class KMeans(AmazonAlgorithmEstimatorBase):
     k = hp('k', gt(1), 'An integer greater-than 1', int)
     init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
     max_iterations = hp('local_lloyd_max_iterations', gt(0), 'An integer greater-than 0', int)
-    tol = hp('local_lloyd_tol', gt(0), 'An integer greater-than 0', int)
+    tol = hp('local_lloyd_tol', (ge(0), le(1)), 'An float in [0, 1]', float)
     num_trials = hp('local_lloyd_num_trials', gt(0), 'An integer greater-than 0', int)
     local_init_method = hp('local_lloyd_init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
     half_life_time_size = hp('half_life_time_size', ge(0), 'An integer greater-than-or-equal-to 0', int)
     epochs = hp('epochs', gt(0), 'An integer greater-than 0', int)
     center_factor = hp('extra_center_factor', gt(0), 'An integer greater-than 0', int)
+    eval_metrics = hp(name='eval_metrics', validation_message='A comma separated list of "msd" or "ssd"',
+                      data_type=list)
 
     def __init__(self, role, train_instance_count, train_instance_type, k, init_method=None,
                  max_iterations=None, tol=None, num_trials=None, local_init_method=None,
-                 half_life_time_size=None, epochs=None, center_factor=None, **kwargs):
+                 half_life_time_size=None, epochs=None, center_factor=None, eval_metrics=None, **kwargs):
         """
         A k-means clustering :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k clusters of data in an
         unlabeled dataset.
@@ -70,7 +72,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
             k (int): The number of clusters to produce.
             init_method (str): How to initialize cluster locations. One of 'random' or 'kmeans++'.
             max_iterations (int): Maximum iterations for Lloyds EM procedure in the local kmeans used in finalize stage.
-            tol (int): Tolerance for change in ssd for early stopping in local kmeans.
+            tol (float): Tolerance for change in ssd for early stopping in local kmeans.
             num_trials (int): Local version is run multiple times and the one with the best loss is chosen. This
                               determines how many times.
             local_init_method (str): Initialization method for local version. One of 'random', 'kmeans++'
@@ -82,6 +84,9 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
             epochs (int): Number of passes done over the training data.
             center_factor(int): The algorithm will create ``num_clusters * extra_center_factor`` as it runs and
                 reduce the number of centers to ``k`` when finalizing
+            eval_metrics(list): JSON list of metric types to be used for reporting the score for the model.
+                Allowed values are "msd": Mean Squared Deviation, and "ssd": Sum of Squared Distances. If test data
+                is provided, the score shall be reported in terms of all requested metrics.
             **kwargs: base class keyword argument values.
         """
         super(KMeans, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
@@ -94,6 +99,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
         self.half_life_time_size = half_life_time_size
         self.epochs = epochs
         self.center_factor = center_factor
+        self.eval_metrics = eval_metrics
 
     def create_model(self):
         """Return a :class:`~sagemaker.amazon.kmeans.KMeansModel` referencing the latest

@@ -32,7 +32,6 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
                                                     data_type=str)
     target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)", float)
     target_precision = hp('target_precision', (gt(0), lt(1)), "A float in (0,1)", float)
-    positive_example_weight_mult = hp('positive_example_weight_mult', gt(0), "A float greater than 0", float)
     epochs = hp('epochs', gt(0), "An integer greater-than 0", int)
     predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
                         'One of "binary_classifier" or "regressor"', str)
@@ -64,9 +63,9 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
     unbias_label = hp('unbias_label', (), 'A boolean', bool)
     num_point_for_scaler = hp('num_point_for_scaler', gt(0), 'An integer greater-than 0', int)
 
-    def __init__(self, role, train_instance_count, train_instance_type, predictor_type='binary_classifier',
+    def __init__(self, role, train_instance_count, train_instance_type, predictor_type,
                  binary_classifier_model_selection_criteria=None, target_recall=None, target_precision=None,
-                 positive_example_weight_mult=None, epochs=None, use_bias=None, num_models=None,
+                 epochs=None, use_bias=None, num_models=None,
                  num_calibration_samples=None, init_method=None, init_scale=None, init_sigma=None, init_bias=None,
                  optimizer=None, loss=None, wd=None, l1=None, momentum=None, learning_rate=None, beta_1=None,
                  beta_2=None, bias_lr_mult=None, bias_wd_mult=None, use_lr_scheduler=None, lr_scheduler_step=None,
@@ -114,8 +113,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
                 precision_at_target_recall.
             target_precision (float): Target precision. Only applicable if binary_classifier_model_selection_criteria
                 is recall_at_target_precision.
-            positive_example_weight_mult (float): The importance weight of positive examples is multiplied by this
-                constant. Useful for skewed datasets. Only applies for classification tasks.
             epochs (int): The maximum number of passes to make over the training data.
             use_bias (bool): Whether to include a bias field
             num_models (int): Number of models to train in parallel. If not set, the number of parallel models to
@@ -160,7 +157,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
         self.binary_classifier_model_selection_criteria = binary_classifier_model_selection_criteria
         self.target_recall = target_recall
         self.target_precision = target_precision
-        self.positive_example_weight_mult = positive_example_weight_mult
         self.epochs = epochs
         self.use_bias = use_bias
         self.num_models = num_models

@@ -13,6 +13,7 @@
 from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
 from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
 from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
+from sagemaker.amazon.validation import gt, isin
 from sagemaker.predictor import RealTimePredictor
 from sagemaker.model import Model
 from sagemaker.session import Session
@@ -25,13 +26,13 @@ class PCA(AmazonAlgorithmEstimatorBase):
 
     DEFAULT_MINI_BATCH_SIZE = 500
 
-    num_components = hp(name='num_components', validate=lambda x: x > 0,
-                        validation_message='Value must be an integer greater than zero', data_type=int)
-    algorithm_mode = hp(name='algorithm_mode', validate=lambda x: x in ['regular', 'stable', 'randomized'],
-                        validation_message='Value must be one of "regular", "stable", "randomized"', data_type=str)
+    num_components = hp('num_components', gt(0), 'Value must be an integer greater than zero', int)
+    algorithm_mode = hp('algorithm_mode', isin('regular', 'randomized'),
+                        'Value must be one of "regular" and "randomized"', str)
     subtract_mean = hp(name='subtract_mean', validation_message='Value must be a boolean', data_type=bool)
-    extra_components = hp(name='extra_components', validate=lambda x: x >= 0,
-                          validation_message="Value must be an integer greater than or equal to 0", data_type=int)
+    extra_components = hp(name='extra_components',
+                          validation_message="Value must be an integer greater than or equal to 0, or -1.",
+                          data_type=int)
 
     def __init__(self, role, train_instance_count, train_instance_type, num_components,
                  algorithm_mode=None, subtract_mean=None, extra_components=None, **kwargs):
@@ -68,12 +69,13 @@ def __init__(self, role, train_instance_count, train_instance_type, num_componen
             train_instance_count (int): Number of Amazon EC2 instances to use for training.
             train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
             num_components(int): The number of principal components. Must be greater than zero.
-            algorithm_mode (str): Mode for computing the principal components. One of 'regular', 'stable' or
+            algorithm_mode (str): Mode for computing the principal components. One of 'regular' or
                 'randomized'.
             subtract_mean (bool): Whether the data should be unbiased both during train and at inference.
             extra_components (int): As the value grows larger, the solution becomes more accurate but the
-                runtime and memory consumption increase linearly. If this value is unset, then a default value equal
-                to the maximum of 10 and num_components will be used. Valid for randomized mode only.
+                runtime and memory consumption increase linearly. If this value is unset or set to -1,
+                then a default value equal to the maximum of 10 and num_components will be used.
+                Valid for randomized mode only.
             **kwargs: base class keyword argument values.
         """
         super(PCA, self).__init__(role, train_instance_count, train_instance_type, **kwargs)

@@ -95,22 +95,6 @@ def test_data_location_does_not_call_default_bucket(sagemaker_session):
     assert not sagemaker_session.default_bucket.called
 
 
-def test_pca_hyperparameters(sagemaker_session):
-    pca = PCA(num_components=55, algorithm_mode='randomized',
-              subtract_mean=True, extra_components=33, sagemaker_session=sagemaker_session,
-              **COMMON_ARGS)
-    assert pca.hyperparameters() == dict(
-        num_components='55',
-        extra_components='33',
-        subtract_mean='True',
-        algorithm_mode='randomized')
-
-
-def test_image(sagemaker_session):
-    pca = PCA(num_components=55, sagemaker_session=sagemaker_session, **COMMON_ARGS)
-    assert pca.train_image() == registry('us-west-2') + '/pca:1'
-
-
 @patch('time.strftime', return_value=TIMESTAMP)
 def test_fit_ndarray(time, sagemaker_session):
     mock_s3 = Mock()

@@ -11,17 +11,29 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 import pytest
-from mock import Mock
+from mock import Mock, patch
 
-from sagemaker.amazon.factorization_machines import FactorizationMachines
-from sagemaker.amazon.amazon_estimator import registry
+from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesPredictor
+from sagemaker.amazon.amazon_estimator import registry, RecordSet
 
+ROLE = 'myrole'
+TRAIN_INSTANCE_COUNT = 1
+TRAIN_INSTANCE_TYPE = 'ml.c4.xlarge'
+NUM_FACTORS = 3
+PREDICTOR_TYPE = 'regressor'
 
-COMMON_TRAIN_ARGS = {'role': 'myrole', 'train_instance_count': 1, 'train_instance_type': 'ml.c4.xlarge'}
-ALL_REQ_ARGS = dict({'num_factors': 3, 'predictor_type': 'regressor'}, **COMMON_TRAIN_ARGS)
+COMMON_TRAIN_ARGS = {'role': ROLE, 'train_instance_count': TRAIN_INSTANCE_COUNT,
+                     'train_instance_type': TRAIN_INSTANCE_TYPE}
+ALL_REQ_ARGS = dict({'num_factors': NUM_FACTORS, 'predictor_type': PREDICTOR_TYPE}, **COMMON_TRAIN_ARGS)
 
-REGION = "us-west-2"
-BUCKET_NAME = "Some-Bucket"
+REGION = 'us-west-2'
+BUCKET_NAME = 'Some-Bucket'
+
+DESCRIBE_TRAINING_JOB_RESULT = {
+    'ModelArtifacts': {
+        'S3ModelArtifacts': 's3://bucket/model.tar.gz'
+    }
+}
 
 
 @pytest.fixture()
@@ -30,6 +42,8 @@ def sagemaker_session():
     sms = Mock(name='sagemaker_session', boto_session=boto_mock)
     sms.boto_region_name = REGION
     sms.default_bucket = Mock(name='default_bucket', return_value=BUCKET_NAME)
+    sms.sagemaker_client.describe_training_job = Mock(name='describe_training_job',
+                                                      return_value=DESCRIBE_TRAINING_JOB_RESULT)
     return sms
 
 
@@ -94,3 +108,146 @@ def test_all_hyperparameters(sagemaker_session):
 def test_image(sagemaker_session):
     fm = FactorizationMachines(sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
     assert fm.train_image() == registry(REGION) + '/factorization-machines:1'
+
+
+@pytest.mark.parametrize('required_hyper_parameters, value', [
+    ('num_factors', 'string'),
+    ('predictor_type', 0)
+])
+def test_required_hyper_parameters_type(sagemaker_session, required_hyper_parameters, value):
+    with pytest.raises(ValueError):
+        test_params = ALL_REQ_ARGS.copy()
+        test_params[required_hyper_parameters] = value
+        FactorizationMachines(sagemaker_session=sagemaker_session, **test_params)
+
+
+@pytest.mark.parametrize('required_hyper_parameters, value', [
+    ('num_factors', 0),
+    ('predictor_type', 'string')
+])
+def test_required_hyper_parameters_value(sagemaker_session, required_hyper_parameters, value):
+    with pytest.raises(ValueError):
+        test_params = ALL_REQ_ARGS.copy()
+        test_params[required_hyper_parameters] = value
+        FactorizationMachines(sagemaker_session=sagemaker_session, **test_params)
+
+
+@pytest.mark.parametrize('optional_hyper_parameters, value', [
+    ('epochs', 'string'),
+    ('clip_gradient', 'string'),
+    ('eps', 'string'),
+    ('rescale_grad', 'string'),
+    ('bias_lr', 'string'),
+    ('linear_lr', 'string'),
+    ('factors_lr', 'string'),
+    ('bias_wd', 'string'),
+    ('linear_wd', 'string'),
+    ('factors_wd', 'string'),
+    ('bias_init_method', 0),
+    ('bias_init_scale', 'string'),
+    ('bias_init_sigma', 'string'),
+    ('bias_init_value', 'string'),
+    ('linear_init_method', 0),
+    ('linear_init_scale', 'string'),
+    ('linear_init_sigma', 'string'),
+    ('linear_init_value', 'string'),
+    ('factors_init_method', 0),
+    ('factors_init_scale', 'string'),
+    ('factors_init_sigma', 'string'),
+    ('factors_init_value', 'string')
+])
+def test_optional_hyper_parameters_type(sagemaker_session, optional_hyper_parameters, value):
+    with pytest.raises(ValueError):
+        test_params = ALL_REQ_ARGS.copy()
+        test_params.update({optional_hyper_parameters: value})
+        FactorizationMachines(sagemaker_session=sagemaker_session, **test_params)
+
+
+@pytest.mark.parametrize('optional_hyper_parameters, value', [
+    ('epochs', 0),
+    ('bias_lr', -1),
+    ('linear_lr', -1),
+    ('factors_lr', -1),
+    ('bias_wd', -1),
+    ('linear_wd', -1),
+    ('factors_wd', -1),
+    ('bias_init_method', 'string'),
+    ('bias_init_scale', -1),
+    ('bias_init_sigma', -1),
+    ('linear_init_method', 'string'),
+    ('linear_init_scale', -1),
+    ('linear_init_sigma', -1),
+    ('factors_init_method', 'string'),
+    ('factors_init_scale', -1),
+    ('factors_init_sigma', -1)
+])
+def test_optional_hyper_parameters_value(sagemaker_session, optional_hyper_parameters, value):
+    with pytest.raises(ValueError):
+        test_params = ALL_REQ_ARGS.copy()
+        test_params.update({optional_hyper_parameters: value})
+        FactorizationMachines(sagemaker_session=sagemaker_session, **test_params)
+
+
+PREFIX = 'prefix'
+FEATURE_DIM = 10
+MINI_BATCH_SIZE = 200
+
+
+@patch('sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit')
+def test_call_fit(base_fit, sagemaker_session):
+    fm = FactorizationMachines(base_job_name='fm', sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM, channel='train')
+
+    fm.fit(data, MINI_BATCH_SIZE)
+
+    base_fit.assert_called_once()
+    assert len(base_fit.call_args[0]) == 2
+    assert base_fit.call_args[0][0] == data
+    assert base_fit.call_args[0][1] == MINI_BATCH_SIZE
+
+
+def test_call_fit_none_mini_batch_size(sagemaker_session):
+    fm = FactorizationMachines(base_job_name='fm', sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM,
+                     channel='train')
+    fm.fit(data)
+
+
+def test_call_fit_wrong_type_mini_batch_size(sagemaker_session):
+    fm = FactorizationMachines(base_job_name='fm', sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM,
+                     channel='train')
+
+    with pytest.raises((TypeError, ValueError)):
+        fm.fit(data, 'some')
+
+
+def test_call_fit_wrong_value_mini_batch_size(sagemaker_session):
+    fm = FactorizationMachines(base_job_name='fm', sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM,
+                     channel='train')
+    with pytest.raises(ValueError):
+        fm.fit(data, 0)
+
+
+def test_model_image(sagemaker_session):
+    fm = FactorizationMachines(sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM, channel='train')
+    fm.fit(data, MINI_BATCH_SIZE)
+
+    model = fm.create_model()
+    assert model.image == registry(REGION, 'factorization-machines') + '/factorization-machines:1'
+
+
+def test_predictor_type(sagemaker_session):
+    fm = FactorizationMachines(sagemaker_session=sagemaker_session, **ALL_REQ_ARGS)
+    data = RecordSet('s3://{}/{}'.format(BUCKET_NAME, PREFIX), num_records=1, feature_dim=FEATURE_DIM, channel='train')
+    fm.fit(data, MINI_BATCH_SIZE)
+    model = fm.create_model()
+    predictor = model.deploy(1, TRAIN_INSTANCE_TYPE)
+
+    assert isinstance(predictor, FactorizationMachinesPredictor)
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ CHANGELOG @@
     ========
     * feature: Tests: create configurable ``sagemaker_session`` pytest fixture for all integration tests
+    * bug-fix: AmazonEstimators: fix inaccurate hyper-parameters in kmeans, pca and linear learner
 1.1.2
     =====
@@ Expand Down @@