Skip to content

Update unit tests of kmeans, pca, factorization machines, lda and ntm #103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ CHANGELOG
========

* feature: Tests: create configurable ``sagemaker_session`` pytest fixture for all integration tests
* bug-fix: AmazonEstimators: fix inaccurate hyper-parameters in kmeans, pca and linear learner

1.1.2
=====
Expand Down
14 changes: 10 additions & 4 deletions src/sagemaker/amazon/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import gt, isin, ge
from sagemaker.amazon.validation import gt, isin, ge, le
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session
Expand All @@ -27,16 +27,18 @@ class KMeans(AmazonAlgorithmEstimatorBase):
k = hp('k', gt(1), 'An integer greater-than 1', int)
init_method = hp('init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
max_iterations = hp('local_lloyd_max_iterations', gt(0), 'An integer greater-than 0', int)
tol = hp('local_lloyd_tol', gt(0), 'An integer greater-than 0', int)
tol = hp('local_lloyd_tol', (ge(0), le(1)), 'An float in [0, 1]', float)
num_trials = hp('local_lloyd_num_trials', gt(0), 'An integer greater-than 0', int)
local_init_method = hp('local_lloyd_init_method', isin('random', 'kmeans++'), 'One of "random", "kmeans++"', str)
half_life_time_size = hp('half_life_time_size', ge(0), 'An integer greater-than-or-equal-to 0', int)
epochs = hp('epochs', gt(0), 'An integer greater-than 0', int)
center_factor = hp('extra_center_factor', gt(0), 'An integer greater-than 0', int)
eval_metrics = hp(name='eval_metrics', validation_message='A comma separated list of "msd" or "ssd"',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did you get the comma-separated list from? The API docs seem to imply just a single value: https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html

If the API docs are wrong, can you ask the algorithms team to fix the docs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked with the algorithm owner: 'eval_metrics' should be a list. I have asked them to update the API docs.

data_type=list)

def __init__(self, role, train_instance_count, train_instance_type, k, init_method=None,
max_iterations=None, tol=None, num_trials=None, local_init_method=None,
half_life_time_size=None, epochs=None, center_factor=None, **kwargs):
half_life_time_size=None, epochs=None, center_factor=None, eval_metrics=None, **kwargs):
"""
A k-means clustering :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k clusters of data in an
unlabeled dataset.
Expand Down Expand Up @@ -70,7 +72,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
k (int): The number of clusters to produce.
init_method (str): How to initialize cluster locations. One of 'random' or 'kmeans++'.
max_iterations (int): Maximum iterations for Lloyds EM procedure in the local kmeans used in finalize stage.
tol (int): Tolerance for change in ssd for early stopping in local kmeans.
tol (float): Tolerance for change in ssd for early stopping in local kmeans.
num_trials (int): Local version is run multiple times and the one with the best loss is chosen. This
determines how many times.
local_init_method (str): Initialization method for local version. One of 'random', 'kmeans++'
Expand All @@ -82,6 +84,9 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
epochs (int): Number of passes done over the training data.
center_factor(int): The algorithm will create ``num_clusters * extra_center_factor`` as it runs and
reduce the number of centers to ``k`` when finalizing
eval_metrics(list): JSON list of metrics types to be used for reporting the score for the model.
Allowed values are "msd" Means Square Error, "ssd": Sum of square distance. If test data is provided,
the score shall be reported in terms of all requested metrics.
**kwargs: base class keyword argument values.
"""
super(KMeans, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
Expand All @@ -94,6 +99,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
self.half_life_time_size = half_life_time_size
self.epochs = epochs
self.center_factor = center_factor
self.eval_metrics = eval_metrics

def create_model(self):
"""Return a :class:`~sagemaker.amazon.kmeans.KMeansModel` referencing the latest
Expand Down
8 changes: 2 additions & 6 deletions src/sagemaker/amazon/linear_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
data_type=str)
target_recall = hp('target_recall', (gt(0), lt(1)), "A float in (0,1)", float)
target_precision = hp('target_precision', (gt(0), lt(1)), "A float in (0,1)", float)
positive_example_weight_mult = hp('positive_example_weight_mult', gt(0), "A float greater than 0", float)
epochs = hp('epochs', gt(0), "An integer greater-than 0", int)
predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
'One of "binary_classifier" or "regressor"', str)
Expand Down Expand Up @@ -64,9 +63,9 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
unbias_label = hp('unbias_label', (), 'A boolean', bool)
num_point_for_scaler = hp('num_point_for_scaler', gt(0), 'An integer greater-than 0', int)

def __init__(self, role, train_instance_count, train_instance_type, predictor_type='binary_classifier',
def __init__(self, role, train_instance_count, train_instance_type, predictor_type,
binary_classifier_model_selection_criteria=None, target_recall=None, target_precision=None,
positive_example_weight_mult=None, epochs=None, use_bias=None, num_models=None,
epochs=None, use_bias=None, num_models=None,
num_calibration_samples=None, init_method=None, init_scale=None, init_sigma=None, init_bias=None,
optimizer=None, loss=None, wd=None, l1=None, momentum=None, learning_rate=None, beta_1=None,
beta_2=None, bias_lr_mult=None, bias_wd_mult=None, use_lr_scheduler=None, lr_scheduler_step=None,
Expand Down Expand Up @@ -114,8 +113,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
precision_at_target_recall.
target_precision (float): Target precision. Only applicable if binary_classifier_model_selection_criteria
is recall_at_target_precision.
positive_example_weight_mult (float): The importance weight of positive examples is multiplied by this
constant. Useful for skewed datasets. Only applies for classification tasks.
epochs (int): The maximum number of passes to make over the training data.
use_bias (bool): Whether to include a bias field
num_models (int): Number of models to train in parallel. If not set, the number of parallel models to
Expand Down Expand Up @@ -160,7 +157,6 @@ def __init__(self, role, train_instance_count, train_instance_type, predictor_ty
self.binary_classifier_model_selection_criteria = binary_classifier_model_selection_criteria
self.target_recall = target_recall
self.target_precision = target_precision
self.positive_example_weight_mult = positive_example_weight_mult
self.epochs = epochs
self.use_bias = use_bias
self.num_models = num_models
Expand Down
20 changes: 11 additions & 9 deletions src/sagemaker/amazon/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
from sagemaker.amazon.validation import gt, isin
from sagemaker.predictor import RealTimePredictor
from sagemaker.model import Model
from sagemaker.session import Session
Expand All @@ -25,13 +26,13 @@ class PCA(AmazonAlgorithmEstimatorBase):

DEFAULT_MINI_BATCH_SIZE = 500

num_components = hp(name='num_components', validate=lambda x: x > 0,
validation_message='Value must be an integer greater than zero', data_type=int)
algorithm_mode = hp(name='algorithm_mode', validate=lambda x: x in ['regular', 'stable', 'randomized'],
validation_message='Value must be one of "regular", "stable", "randomized"', data_type=str)
num_components = hp('num_components', gt(0), 'Value must be an integer greater than zero', int)
algorithm_mode = hp('algorithm_mode', isin('regular', 'randomized'),
'Value must be one of "regular" and "randomized"', str)
subtract_mean = hp(name='subtract_mean', validation_message='Value must be a boolean', data_type=bool)
extra_components = hp(name='extra_components', validate=lambda x: x >= 0,
validation_message="Value must be an integer greater than or equal to 0", data_type=int)
extra_components = hp(name='extra_components',
validation_message="Value must be an integer greater than or equal to 0, or -1.",
data_type=int)

def __init__(self, role, train_instance_count, train_instance_type, num_components,
algorithm_mode=None, subtract_mean=None, extra_components=None, **kwargs):
Expand Down Expand Up @@ -68,12 +69,13 @@ def __init__(self, role, train_instance_count, train_instance_type, num_componen
train_instance_count (int): Number of Amazon EC2 instances to use for training.
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
num_components(int): The number of principal components. Must be greater than zero.
algorithm_mode (str): Mode for computing the principal components. One of 'regular', 'stable' or
algorithm_mode (str): Mode for computing the principal components. One of 'regular' or
'randomized'.
subtract_mean (bool): Whether the data should be unbiased both during train and at inference.
extra_components (int): As the value grows larger, the solution becomes more accurate but the
runtime and memory consumption increase linearly. If this value is unset, then a default value equal
to the maximum of 10 and num_components will be used. Valid for randomized mode only.
runtime and memory consumption increase linearly. If this value is unset or set to -1,
then a default value equal to the maximum of 10 and num_components will be used.
Valid for randomized mode only.
**kwargs: base class keyword argument values.
"""
super(PCA, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
Expand Down
8 changes: 2 additions & 6 deletions tests/integ/test_linear_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,11 @@ def test_linear_learner(sagemaker_session):
train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
sagemaker_session=sagemaker_session)
predictor_type='binary_classifier', sagemaker_session=sagemaker_session)
ll.binary_classifier_model_selection_criteria = 'accuracy'
ll.target_recall = 0.5
ll.target_precision = 0.5
ll.positive_example_weight_mult = 0.1
ll.epochs = 1
ll.predictor_type = 'binary_classifier'
ll.use_bias = True
ll.num_models = 1
ll.num_calibration_samples = 1
Expand Down Expand Up @@ -100,13 +98,11 @@ def test_async_linear_learner(sagemaker_session):
train_set = train_set[0], train_set[1].astype(np.dtype('float32'))

ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
sagemaker_session=sagemaker_session)
predictor_type='binary_classifier', sagemaker_session=sagemaker_session)
ll.binary_classifier_model_selection_criteria = 'accuracy'
ll.target_recall = 0.5
ll.target_precision = 0.5
ll.positive_example_weight_mult = 0.1
ll.epochs = 1
ll.predictor_type = 'binary_classifier'
ll.use_bias = True
ll.num_models = 1
ll.num_calibration_samples = 1
Expand Down
16 changes: 0 additions & 16 deletions tests/unit/test_amazon_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,22 +95,6 @@ def test_data_location_does_not_call_default_bucket(sagemaker_session):
assert not sagemaker_session.default_bucket.called


def test_pca_hyperparameters(sagemaker_session):
    """Check that PCA reports each explicitly-set hyperparameter as a string."""
    estimator = PCA(
        num_components=55,
        algorithm_mode='randomized',
        subtract_mean=True,
        extra_components=33,
        sagemaker_session=sagemaker_session,
        **COMMON_ARGS
    )
    # The assertion compares against string values, not the raw int/bool inputs.
    expected = {
        'num_components': '55',
        'extra_components': '33',
        'subtract_mean': 'True',
        'algorithm_mode': 'randomized',
    }
    assert estimator.hyperparameters() == expected


def test_image(sagemaker_session):
    """Check that the PCA training image is the us-west-2 registry's ``pca:1``."""
    estimator = PCA(num_components=55, sagemaker_session=sagemaker_session, **COMMON_ARGS)
    # Resolve the estimator's image first, then build the expected URI to compare.
    actual_image = estimator.train_image()
    assert actual_image == registry('us-west-2') + '/pca:1'


@patch('time.strftime', return_value=TIMESTAMP)
def test_fit_ndarray(time, sagemaker_session):
mock_s3 = Mock()
Expand Down
Loading