Merge branch 'master' into local_mode_phase1

iquintero · web-flow · commit d716d399fcdd · 2018-05-08T10:21:48.000-07:00
diff --git a/.github/issue_template.md b/.github/issue_template.md
@@ -0,0 +1,17 @@
+Please fill out the form below.
+
+### System Information
+- **Framework (e.g. TensorFlow) / Algorithm (e.g. KMeans)**:
+- **Framework Version**:
+- **Python Version**:
+- **CPU or GPU**:
+- **Python SDK Version**:
+- **Are you using a custom image**:
+
+### Describe the problem
+Describe the problem or feature request clearly here.
+
+### Minimal repro / logs
+Please provide any logs and a bare minimum reproducible test case, as this will be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
+
+- **Exact command to reproduce**:
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,6 +2,11 @@
 CHANGELOG
 =========
 
+1.2.4
+=====
+
+* feature: Estimators: add support for Amazon Random Cut Forest algorithm
+
 1.2.3
 =========
 * bug-fix: Fix local mode not using the right s3 bucket
diff --git a/README.rst b/README.rst
@@ -47,7 +47,7 @@ You can install from source by cloning this repository and issuing a pip install
 
     git clone https://github.com/aws/sagemaker-python-sdk.git
     python setup.py sdist
-    pip install dist/sagemaker-1.2.3.tar.gz
+    pip install dist/sagemaker-1.2.4.tar.gz
 
 Supported Python versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1575,7 +1575,7 @@ Amazon SageMaker provides several built-in machine learning algorithms that you
 
 The full list of algorithms is available on the AWS website: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
 
-SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA) and Neural Topic Model(NTM) algorithms.
+SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA), Neural Topic Model(NTM) and Random Cut Forest algorithms.
 
 Definition and usage
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/conf.py b/doc/conf.py
@@ -19,7 +19,7 @@ def __getattr__(cls, name):
                 'numpy', 'scipy', 'scipy.sparse']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
-version = '1.2.3'
+version = '1.2.4'
 project = u'sagemaker'
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
diff --git a/doc/randomcutforest.rst b/doc/randomcutforest.rst
@@ -0,0 +1,22 @@
+Random Cut Forest
+--------------------
+
+The Amazon SageMaker Random Cut Forest algorithm.
+
+.. autoclass:: sagemaker.RandomCutForest
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :inherited-members:
+    :exclude-members: image, num_trees, num_samples_per_tree, eval_metrics, MINI_BATCH_SIZE
+
+
+.. autoclass:: sagemaker.RandomCutForestModel
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+.. autoclass:: sagemaker.RandomCutForestPredictor
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@ def read(fname):
 
 
 setup(name="sagemaker",
-      version="1.2.3",
+      version="1.2.4",
       description="Open source library for training and deploying models on Amazon SageMaker.",
       packages=find_packages('src'),
       package_dir={'': 'src'},
diff --git a/src/sagemaker/__init__.py b/src/sagemaker/__init__.py
@@ -20,6 +20,7 @@
 from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
 from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
 from sagemaker.amazon.ntm import NTM, NTMModel, NTMPredictor
+from sagemaker.amazon.randomcutforest import RandomCutForest, RandomCutForestModel, RandomCutForestPredictor
 
 from sagemaker.local.local_session import LocalSession
 
@@ -36,5 +37,6 @@
            LinearLearnerModel, LinearLearnerPredictor,
            LDA, LDAModel, LDAPredictor,
            FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
+           RandomCutForest, RandomCutForestModel, RandomCutForestPredictor,
            Model, NTM, NTMModel, NTMPredictor, RealTimePredictor, Session, LocalSession,
            container_def, s3_input, production_variant, get_execution_role]
diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py
@@ -228,7 +228,8 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
 
 def registry(region_name, algorithm=None):
     """Return docker registry for the given AWS region"""
-    if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm"]:
+    if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm",
+                     "randomcutforest"]:
         account_id = {
             "us-east-1": "382416733822",
             "us-east-2": "404615174143",
diff --git a/src/sagemaker/amazon/randomcutforest.py b/src/sagemaker/amazon/randomcutforest.py
@@ -0,0 +1,126 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
+from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
+from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
+from sagemaker.amazon.validation import ge, le
+from sagemaker.predictor import RealTimePredictor
+from sagemaker.model import Model
+from sagemaker.session import Session
+
+
+class RandomCutForest(AmazonAlgorithmEstimatorBase):
+
+    repo_name = 'randomcutforest'
+    repo_version = 1
+    MINI_BATCH_SIZE = 1000
+
+    eval_metrics = hp(name='eval_metrics',
+                      validation_message='A comma separated list of "accuracy" or "precision_recall_fscore"',
+                      data_type=list)
+
+    num_trees = hp('num_trees', (ge(50), le(1000)), 'An integer in [50, 1000]', int)
+    num_samples_per_tree = hp('num_samples_per_tree', (ge(1), le(2048)), 'An integer in [1, 2048]', int)
+    feature_dim = hp("feature_dim", (ge(1), le(10000)), 'An integer in [1, 10000]', int)
+
+    def __init__(self, role, train_instance_count, train_instance_type,
+                 num_samples_per_tree=None, num_trees=None, eval_metrics=None, **kwargs):
+        """RandomCutForest is an :class:`Estimator` used for anomaly detection.
+
+        This Estimator may be fit via calls to
+        :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
+        :class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
+        There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
+        can be used to upload data to S3 and creates :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
+        to the `fit` call.
+
+        To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
+        consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
+
+        After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
+        Endpoint by invoking :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as deploying an
+        Endpoint, deploy returns a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestPredictor` object
+        that can be used for inference calls using the trained model hosted in the SageMaker Endpoint.
+
+        RandomCutForest Estimators can be configured by setting hyperparameters. The available hyperparameters for
+        RandomCutForest are documented below.
+
+        For further information on the AWS Random Cut Forest algorithm,
+        please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html
+
+        Args:
+            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
+                APIs that create Amazon SageMaker endpoints use this role to access
+                training data and model artifacts. After the endpoint is created,
+                the inference code might use the IAM role if it needs to access AWS resources.
+            train_instance_count (int): Number of Amazon EC2 instances to use for training.
+            train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+            num_samples_per_tree (int): Optional. The number of samples used to build each tree in the forest.
+                The total number of samples drawn from the train dataset is num_trees * num_samples_per_tree.
+            num_trees (int): Optional. The number of trees used in the forest.
+            eval_metrics (list): Optional. JSON list of metric types to be used for reporting the score for the model.
+                Allowed values are "accuracy", "precision_recall_fscore": positive and negative precision, recall,
+                and f1 scores. If test data is provided, the score shall be reported in terms of all requested metrics.
+            **kwargs: base class keyword argument values.
+        """
+
+        super(RandomCutForest, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
+        self.num_samples_per_tree = num_samples_per_tree
+        self.num_trees = num_trees
+        self.eval_metrics = eval_metrics
+
+    def create_model(self):
+        """Return a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestModel` referencing the latest
+        s3 model data produced by this Estimator."""
+
+        return RandomCutForestModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
+
+    def fit(self, records, mini_batch_size=None, **kwargs):
+        if mini_batch_size is None:
+            mini_batch_size = RandomCutForest.MINI_BATCH_SIZE
+        elif mini_batch_size != RandomCutForest.MINI_BATCH_SIZE:
+            raise ValueError("Random Cut Forest uses a fixed mini_batch_size of {}"
+                             .format(RandomCutForest.MINI_BATCH_SIZE))
+        super(RandomCutForest, self).fit(records, mini_batch_size, **kwargs)
+
+
+class RandomCutForestPredictor(RealTimePredictor):
+    """Assigns an anomaly score to each of the datapoints provided.
+
+    The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
+    `RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
+    same number of columns as the feature-dimension of the data used to fit the model this
+    Predictor performs inference on.
+
+    :meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects,
+    one for each row in the input. Each row's score is stored in the key ``score`` of the
+    ``Record.label`` field."""
+
+    def __init__(self, endpoint, sagemaker_session=None):
+        super(RandomCutForestPredictor, self).__init__(endpoint, sagemaker_session,
+                                                       serializer=numpy_to_record_serializer(),
+                                                       deserializer=record_deserializer())
+
+
+class RandomCutForestModel(Model):
+    """Reference RandomCutForest s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an
+    Endpoint and returns a Predictor that calculates anomaly scores for datapoints."""
+
+    def __init__(self, model_data, role, sagemaker_session=None):
+        sagemaker_session = sagemaker_session or Session()
+        repo = '{}:{}'.format(RandomCutForest.repo_name, RandomCutForest.repo_version)
+        image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name,
+                                        RandomCutForest.repo_name), repo)
+        super(RandomCutForestModel, self).__init__(model_data, image, role,
+                                                   predictor_cls=RandomCutForestPredictor,
+                                                   sagemaker_session=sagemaker_session)
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -87,7 +87,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
             if self.train_instance_type == 'local_gpu' and self.train_instance_count > 1:
                 raise RuntimeError("Distributed Training in Local GPU is not supported")
 
-            self.sagemaker_session = LocalSession()
+            self.sagemaker_session = sagemaker_session or LocalSession()
         else:
             self.local_mode = False
             self.sagemaker_session = sagemaker_session or Session()
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -212,8 +212,10 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
             job_name (str): Name of the training job being created.
             output_config (dict): The S3 URI where you want to store the training results and optional KMS key ID.
             resource_config (dict): Contains values for ResourceConfig:
-            instance_count (int): Number of EC2 instances to use for training.
-            instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+                * instance_count (int): Number of EC2 instances to use for training.
+                    The key in resource_config is 'InstanceCount'.
+                * instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+                    The key in resource_config is 'InstanceType'.
             hyperparameters (dict): Hyperparameters for model training. The hyperparameters are made accessible as
                 a dict[str, str] to the training code on SageMaker. For convenience, this accepts other types for
                 keys and values, but ``str()`` will be called to convert them before training.
@@ -557,7 +559,7 @@ def logs_for_job(self, job_name, wait=False, poll=10):  # noqa: C901 - suppress
 
         Args:
             job_name (str): Name of the training job to display the logs for.
-            wait (bool): Whether to keep looking for new log entries until the job completes (default: True).
+            wait (bool): Whether to keep looking for new log entries until the job completes (default: False).
             poll (int): The interval in seconds between polling for new log entries and job completion (default: 5).
 
         Raises:
diff --git a/tests/integ/test_byo_estimator.py b/tests/integ/test_byo_estimator.py
@@ -42,6 +42,7 @@ def fm_serializer(data):
     return json.dumps(js)
 
 
+@pytest.mark.continuous_testing
 def test_byo_estimator(sagemaker_session, region):
     """Use Factorization Machines algorithm as an example here.
 
diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
@@ -16,12 +16,15 @@
 import sys
 import time
 
+import pytest
+
 from sagemaker import FactorizationMachines, FactorizationMachinesModel
 from sagemaker.utils import name_from_base
 from tests.integ import DATA_DIR
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
+@pytest.mark.continuous_testing
 def test_factorization_machines(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
@@ -16,12 +16,15 @@
 import sys
 import time
 
+import pytest
+
 from sagemaker import KMeans, KMeansModel
 from sagemaker.utils import name_from_base
 from tests.integ import DATA_DIR
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
+@pytest.mark.continuous_testing
 def test_kmeans(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
diff --git a/tests/integ/test_lda.py b/tests/integ/test_lda.py
@@ -13,6 +13,7 @@
 import os
 
 import numpy as np
+import pytest
 
 from sagemaker import LDA, LDAModel
 from sagemaker.amazon.common import read_records
@@ -22,6 +23,7 @@
 from tests.integ.record_set import prepare_record_set_from_local_files
 
 
+@pytest.mark.continuous_testing
 def test_lda(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'lda')
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
@@ -17,13 +17,15 @@
 import time
 
 import numpy as np
+import pytest
 
 from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
 from sagemaker.utils import name_from_base, sagemaker_timestamp
 from tests.integ import DATA_DIR
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
+@pytest.mark.continuous_testing
 def test_linear_learner(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
@@ -42,6 +42,7 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version):
         return mx.latest_training_job.name
 
 
+@pytest.mark.continuous_testing
 def test_attach_deploy(mxnet_training_job, sagemaker_session):
     endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())
 
diff --git a/tests/integ/test_ntm.py b/tests/integ/test_ntm.py
@@ -13,6 +13,7 @@
 import os
 
 import numpy as np
+import pytest
 
 from sagemaker import NTM, NTMModel
 from sagemaker.amazon.common import read_records
@@ -22,6 +23,7 @@
 from tests.integ.record_set import prepare_record_set_from_local_files
 
 
+@pytest.mark.continuous_testing
 def test_ntm(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'ntm')
diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py
@@ -16,12 +16,15 @@
 import sys
 import time
 
+import pytest
+
 import sagemaker.amazon.pca
 from sagemaker.utils import name_from_base
 from tests.integ import DATA_DIR
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
+@pytest.mark.continuous_testing
 def test_pca(sagemaker_session):
     with timeout(minutes=15):
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
diff --git a/tests/integ/test_randomcutforest.py b/tests/integ/test_randomcutforest.py
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py
diff --git a/tests/integ/timeout.py b/tests/integ/timeout.py
diff --git a/tests/unit/test_randomcutforest.py b/tests/unit/test_randomcutforest.py