Skip to content

Commit d716d39

Browse files
authored
Merge branch 'master' into local_mode_phase1
2 parents 6f1cf40 + a9717a0 commit d716d39

24 files changed

+453
-11
lines changed

.github/issue_template.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
Please fill out the form below.
2+
3+
### System Information
4+
- **Framework (e.g. TensorFlow) / Algorithm (e.g. KMeans)**:
5+
- **Framework Version**:
6+
- **Python Version**:
7+
- **CPU or GPU**:
8+
- **Python SDK Version**:
9+
- **Are you using a custom image**:
10+
11+
### Describe the problem
12+
Describe the problem or feature request clearly here.
13+
14+
### Minimal repro / logs
15+
Please provide any logs and a bare minimum reproducible test case, as this will be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
16+
17+
- **Exact command to reproduce**:

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
CHANGELOG
33
=========
44

5+
1.2.4
6+
=====
7+
8+
* feature: Estimators: add support for Amazon Random Cut Forest algorithm
9+
510
1.2.3
611
=========
712
* bug-fix: Fix local mode not using the right s3 bucket

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ You can install from source by cloning this repository and issuing a pip install
4747

4848
git clone https://github.com/aws/sagemaker-python-sdk.git
4949
python setup.py sdist
50-
pip install dist/sagemaker-1.2.3.tar.gz
50+
pip install dist/sagemaker-1.2.4.tar.gz
5151

5252
Supported Python versions
5353
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1575,7 +1575,7 @@ Amazon SageMaker provides several built-in machine learning algorithms that you
15751575
15761576
The full list of algorithms is available on the AWS website: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html
15771577
1578-
SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA) and Neural Topic Model(NTM) algorithms.
1578+
SageMaker Python SDK includes Estimator wrappers for the AWS K-means, Principal Components Analysis(PCA), Linear Learner, Factorization Machines, Latent Dirichlet Allocation(LDA), Neural Topic Model(NTM) and Random Cut Forest algorithms.
15791579
15801580
Definition and usage
15811581
~~~~~~~~~~~~~~~~~~~~

doc/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def __getattr__(cls, name):
1919
'numpy', 'scipy', 'scipy.sparse']
2020
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
2121

22-
version = '1.2.3'
22+
version = '1.2.4'
2323
project = u'sagemaker'
2424

2525
# Add any Sphinx extension module names here, as strings. They can be extensions

doc/randomcutforest.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Random Cut Forest
2+
--------------------
3+
4+
The Amazon SageMaker Random Cut Forest algorithm.
5+
6+
.. autoclass:: sagemaker.RandomCutForest
7+
:members:
8+
:undoc-members:
9+
:show-inheritance:
10+
:inherited-members:
11+
:exclude-members: image, num_trees, num_samples_per_tree, eval_metrics, MINI_BATCH_SIZE
12+
13+
14+
.. autoclass:: sagemaker.RandomCutForestModel
15+
:members:
16+
:undoc-members:
17+
:show-inheritance:
18+
19+
.. autoclass:: sagemaker.RandomCutForestPredictor
20+
:members:
21+
:undoc-members:
22+
:show-inheritance:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def read(fname):
1111

1212

1313
setup(name="sagemaker",
14-
version="1.2.3",
14+
version="1.2.4",
1515
description="Open source library for training and deploying models on Amazon SageMaker.",
1616
packages=find_packages('src'),
1717
package_dir={'': 'src'},

src/sagemaker/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from sagemaker.amazon.factorization_machines import FactorizationMachines, FactorizationMachinesModel
2121
from sagemaker.amazon.factorization_machines import FactorizationMachinesPredictor
2222
from sagemaker.amazon.ntm import NTM, NTMModel, NTMPredictor
23+
from sagemaker.amazon.randomcutforest import RandomCutForest, RandomCutForestModel, RandomCutForestPredictor
2324

2425
from sagemaker.local.local_session import LocalSession
2526

@@ -36,5 +37,6 @@
3637
LinearLearnerModel, LinearLearnerPredictor,
3738
LDA, LDAModel, LDAPredictor,
3839
FactorizationMachines, FactorizationMachinesModel, FactorizationMachinesPredictor,
40+
RandomCutForest, RandomCutForestModel, RandomCutForestPredictor,
3941
Model, NTM, NTMModel, NTMPredictor, RealTimePredictor, Session, LocalSession,
4042
container_def, s3_input, production_variant, get_execution_role]

src/sagemaker/amazon/amazon_estimator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,8 @@ def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=
228228

229229
def registry(region_name, algorithm=None):
230230
"""Return docker registry for the given AWS region"""
231-
if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm"]:
231+
if algorithm in [None, "pca", "kmeans", "linear-learner", "factorization-machines", "ntm",
232+
"randomcutforest"]:
232233
account_id = {
233234
"us-east-1": "382416733822",
234235
"us-east-2": "404615174143",
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
14+
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
15+
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
16+
from sagemaker.amazon.validation import ge, le
17+
from sagemaker.predictor import RealTimePredictor
18+
from sagemaker.model import Model
19+
from sagemaker.session import Session
20+
21+
22+
class RandomCutForest(AmazonAlgorithmEstimatorBase):
23+
24+
repo_name = 'randomcutforest'
25+
repo_version = 1
26+
MINI_BATCH_SIZE = 1000
27+
28+
eval_metrics = hp(name='eval_metrics',
29+
validation_message='A comma separated list of "accuracy" or "precision_recall_fscore"',
30+
data_type=list)
31+
32+
num_trees = hp('num_trees', (ge(50), le(1000)), 'An integer in [50, 1000]', int)
33+
num_samples_per_tree = hp('num_samples_per_tree', (ge(1), le(2048)), 'An integer in [1, 2048]', int)
34+
feature_dim = hp("feature_dim", (ge(1), le(10000)), 'An integer in [1, 10000]', int)
35+
36+
def __init__(self, role, train_instance_count, train_instance_type,
37+
num_samples_per_tree=None, num_trees=None, eval_metrics=None, **kwargs):
38+
"""RandomCutForest is :class:`Estimator` used for anomaly detection.
39+
40+
This Estimator may be fit via calls to
41+
:meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.fit`. It requires Amazon
42+
:class:`~sagemaker.amazon.record_pb2.Record` protobuf serialized data to be stored in S3.
43+
There is a utility :meth:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase.record_set` that
44+
can be used to upload data to S3 and creates :class:`~sagemaker.amazon.amazon_estimator.RecordSet` to be passed
45+
to the `fit` call.
46+
47+
To learn more about the Amazon protobuf Record class and how to prepare bulk data in this format, please
48+
consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html
49+
50+
After this Estimator is fit, model data is stored in S3. The model may be deployed to an Amazon SageMaker
51+
Endpoint by invoking :meth:`~sagemaker.estimator.EstimatorBase.deploy`. As well as deploying an
52+
Endpoint, deploy returns a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestPredictor` object that can be used
53+
for inference calls using the trained model hosted in the SageMaker Endpoint.
54+
55+
RandomCutForest Estimators can be configured by setting hyperparameters. The available hyperparameters for
56+
RandomCutForest are documented below.
57+
58+
For further information on the AWS Random Cut Forest algorithm,
59+
please consult AWS technical documentation: https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html
60+
61+
Args:
62+
role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and
63+
APIs that create Amazon SageMaker endpoints use this role to access
64+
training data and model artifacts. After the endpoint is created,
65+
the inference code might use the IAM role, if it needs to access an AWS resource.
66+
train_instance_count (int): Number of Amazon EC2 instances to use for training.
67+
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
68+
num_samples_per_tree (int): Optional. The number of samples used to build each tree in the forest.
69+
The total number of samples drawn from the train dataset is num_trees * num_samples_per_tree.
70+
num_trees (int): Optional. The number of trees used in the forest.
71+
eval_metrics (list): Optional. JSON list of metric types to be used for reporting the score for the model.
72+
Allowed values are "accuracy", "precision_recall_fscore": positive and negative precision, recall,
73+
and f1 scores. If test data is provided, the score shall be reported in terms of all requested metrics.
74+
**kwargs: base class keyword argument values.
75+
"""
76+
77+
super(RandomCutForest, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
78+
self.num_samples_per_tree = num_samples_per_tree
79+
self.num_trees = num_trees
80+
self.eval_metrics = eval_metrics
81+
82+
def create_model(self):
83+
"""Return a :class:`~sagemaker.amazon.randomcutforest.RandomCutForestModel` referencing the latest
84+
s3 model data produced by this Estimator."""
85+
86+
return RandomCutForestModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
87+
88+
def fit(self, records, mini_batch_size=None, **kwargs):
89+
if mini_batch_size is None:
90+
mini_batch_size = RandomCutForest.MINI_BATCH_SIZE
91+
elif mini_batch_size != RandomCutForest.MINI_BATCH_SIZE:
92+
raise ValueError("Random Cut Forest uses a fixed mini_batch_size of {}"
93+
.format(RandomCutForest.MINI_BATCH_SIZE))
94+
super(RandomCutForest, self).fit(records, mini_batch_size, **kwargs)
95+
96+
97+
class RandomCutForestPredictor(RealTimePredictor):
98+
"""Assigns an anomaly score to each of the datapoints provided.
99+
100+
The implementation of :meth:`~sagemaker.predictor.RealTimePredictor.predict` in this
101+
`RealTimePredictor` requires a numpy ``ndarray`` as input. The array should contain the
102+
same number of columns as the feature-dimension of the data used to fit the model this
103+
Predictor performs inference on.
104+
105+
:meth:`predict()` returns a list of :class:`~sagemaker.amazon.record_pb2.Record` objects,
106+
one for each row in the input. Each row's score is stored in the key ``score`` of the
107+
``Record.label`` field."""
108+
109+
def __init__(self, endpoint, sagemaker_session=None):
110+
super(RandomCutForestPredictor, self).__init__(endpoint, sagemaker_session,
111+
serializer=numpy_to_record_serializer(),
112+
deserializer=record_deserializer())
113+
114+
115+
class RandomCutForestModel(Model):
116+
"""Reference RandomCutForest s3 model data. Calling :meth:`~sagemaker.model.Model.deploy` creates an
117+
Endpoint and returns a Predictor that calculates anomaly scores for datapoints."""
118+
119+
def __init__(self, model_data, role, sagemaker_session=None):
120+
sagemaker_session = sagemaker_session or Session()
121+
repo = '{}:{}'.format(RandomCutForest.repo_name, RandomCutForest.repo_version)
122+
image = '{}/{}'.format(registry(sagemaker_session.boto_session.region_name,
123+
RandomCutForest.repo_name), repo)
124+
super(RandomCutForestModel, self).__init__(model_data, image, role,
125+
predictor_cls=RandomCutForestPredictor,
126+
sagemaker_session=sagemaker_session)

src/sagemaker/estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
8787
if self.train_instance_type == 'local_gpu' and self.train_instance_count > 1:
8888
raise RuntimeError("Distributed Training in Local GPU is not supported")
8989

90-
self.sagemaker_session = LocalSession()
90+
self.sagemaker_session = sagemaker_session or LocalSession()
9191
else:
9292
self.local_mode = False
9393
self.sagemaker_session = sagemaker_session or Session()

src/sagemaker/session.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,10 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
212212
job_name (str): Name of the training job being created.
213213
output_config (dict): The S3 URI where you want to store the training results and optional KMS key ID.
214214
resource_config (dict): Contains values for ResourceConfig:
215-
instance_count (int): Number of EC2 instances to use for training.
216-
instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
215+
* instance_count (int): Number of EC2 instances to use for training.
216+
The key in resource_config is 'InstanceCount'.
217+
* instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
218+
The key in resource_config is 'InstanceType'.
217219
hyperparameters (dict): Hyperparameters for model training. The hyperparameters are made accessible as
218220
a dict[str, str] to the training code on SageMaker. For convenience, this accepts other types for
219221
keys and values, but ``str()`` will be called to convert them before training.
@@ -557,7 +559,7 @@ def logs_for_job(self, job_name, wait=False, poll=10): # noqa: C901 - suppress
557559
558560
Args:
559561
job_name (str): Name of the training job to display the logs for.
560-
wait (bool): Whether to keep looking for new log entries until the job completes (default: True).
562+
wait (bool): Whether to keep looking for new log entries until the job completes (default: False).
561563
poll (int): The interval in seconds between polling for new log entries and job completion (default: 10).
562564
563565
Raises:

tests/integ/test_byo_estimator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def fm_serializer(data):
4242
return json.dumps(js)
4343

4444

45+
@pytest.mark.continuous_testing
4546
def test_byo_estimator(sagemaker_session, region):
4647
"""Use Factorization Machines algorithm as an example here.
4748

tests/integ/test_factorization_machines.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
import sys
1717
import time
1818

19+
import pytest
20+
1921
from sagemaker import FactorizationMachines, FactorizationMachinesModel
2022
from sagemaker.utils import name_from_base
2123
from tests.integ import DATA_DIR
2224
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2325

2426

27+
@pytest.mark.continuous_testing
2528
def test_factorization_machines(sagemaker_session):
2629
with timeout(minutes=15):
2730
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')

tests/integ/test_kmeans.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
import sys
1717
import time
1818

19+
import pytest
20+
1921
from sagemaker import KMeans, KMeansModel
2022
from sagemaker.utils import name_from_base
2123
from tests.integ import DATA_DIR
2224
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2325

2426

27+
@pytest.mark.continuous_testing
2528
def test_kmeans(sagemaker_session):
2629
with timeout(minutes=15):
2730
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')

tests/integ/test_lda.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os
1414

1515
import numpy as np
16+
import pytest
1617

1718
from sagemaker import LDA, LDAModel
1819
from sagemaker.amazon.common import read_records
@@ -22,6 +23,7 @@
2223
from tests.integ.record_set import prepare_record_set_from_local_files
2324

2425

26+
@pytest.mark.continuous_testing
2527
def test_lda(sagemaker_session):
2628
with timeout(minutes=15):
2729
data_path = os.path.join(DATA_DIR, 'lda')

tests/integ/test_linear_learner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
import time
1818

1919
import numpy as np
20+
import pytest
2021

2122
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
2223
from sagemaker.utils import name_from_base, sagemaker_timestamp
2324
from tests.integ import DATA_DIR
2425
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2526

2627

28+
@pytest.mark.continuous_testing
2729
def test_linear_learner(sagemaker_session):
2830
with timeout(minutes=15):
2931
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')

tests/integ/test_mxnet_train.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def mxnet_training_job(sagemaker_session, mxnet_full_version):
4242
return mx.latest_training_job.name
4343

4444

45+
@pytest.mark.continuous_testing
4546
def test_attach_deploy(mxnet_training_job, sagemaker_session):
4647
endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())
4748

tests/integ/test_ntm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os
1414

1515
import numpy as np
16+
import pytest
1617

1718
from sagemaker import NTM, NTMModel
1819
from sagemaker.amazon.common import read_records
@@ -22,6 +23,7 @@
2223
from tests.integ.record_set import prepare_record_set_from_local_files
2324

2425

26+
@pytest.mark.continuous_testing
2527
def test_ntm(sagemaker_session):
2628
with timeout(minutes=15):
2729
data_path = os.path.join(DATA_DIR, 'ntm')

tests/integ/test_pca.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
import sys
1717
import time
1818

19+
import pytest
20+
1921
import sagemaker.amazon.pca
2022
from sagemaker.utils import name_from_base
2123
from tests.integ import DATA_DIR
2224
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2325

2426

27+
@pytest.mark.continuous_testing
2528
def test_pca(sagemaker_session):
2629
with timeout(minutes=15):
2730
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')

0 commit comments

Comments
 (0)