
Commit 9b9272b

Minimal changes to demonstrate creation of HPO job. (aws#8)

1 parent 22d3d07 commit 9b9272b
9 files changed: +330 -19 lines

src/sagemaker/amazon/amazon_estimator.py

Lines changed: 3 additions & 0 deletions
@@ -53,6 +53,9 @@ def train_image(self):
     def hyperparameters(self):
         return hp.serialize_all(self)

+    def hpo_hyperparameters(self):
+        return hp.serialize_all_hpo(self)
+
     @property
     def data_location(self):
         return self._data_location

src/sagemaker/amazon/hyperparameter.py

Lines changed: 1 addition & 1 deletion
@@ -112,5 +112,5 @@ def serialize_all_hpo(obj):
     for range_type in _HpoParameter.__all_types__:
         parameter_range = [param.as_hpo_range(p_name)
                            for p_name, param in obj._hpo_parameters.items() if param.__name__ == range_type]
-        parameter_ranges[range_type+'ParameterRange'] = parameter_range
+        parameter_ranges[range_type+'ParameterRanges'] = parameter_range
     return parameter_ranges
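The rename matters because serialize_all_hpo() feeds the 'ParameterRanges' section of the CreateHyperParameterTuningJob request, whose keys are plural ('IntegerParameterRanges', and so on). As a rough illustration of the shape it produces, assuming an estimator with one integer and one categorical range set as in the integration test below (the parameter names are examples):

# Illustrative only: approximate return value of estimator.hpo_hyperparameters()
# after the fix, for one Integer and one Categorical range.
parameter_ranges = {
    'ContinuousParameterRanges': [],
    'CategoricalParameterRanges': [
        {'Name': 'local_init_method', 'Values': ['kmeans++', 'random']}
    ],
    'IntegerParameterRanges': [
        {'Name': 'mini_batch_size', 'MinValue': '10', 'MaxValue': '100'}
    ]
}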

src/sagemaker/amazon/kmeans.py

Lines changed: 26 additions & 1 deletion
@@ -89,7 +89,32 @@ def __init__(self, role, train_instance_count, train_instance_type, k, init_meth
                 the score shall be reported in terms of all requested metrics.
             **kwargs: base class keyword argument values.
         """
-        super(KMeans, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
+        # TODO: shouldn't be defined here; delete this once HPO fixes validation
+        metric_definitions = [
+            {
+                "Name": "test:msd",
+                "Regex": "#quality_metric: host=\\S+, test msd <loss>=(\\S+)"
+            },
+            {
+                "Name": "test:ssd",
+                "Regex": "#quality_metric: host=\\S+, test ssd <loss>=(\\S+)"
+            },
+            {
+                "Name": "train:msd",
+                "Regex": "#quality_metric: host=\\S+, train msd <loss>=(\\S+)"
+            },
+            {
+                "Name": "train:progress",
+                "Regex": "#progress_metric: host=\\S+, completed (\\S+) %"
+            },
+            # updated based on the current log format
+            {
+                "Name": "train:throughput",
+                "Regex": "#throughput_metric: train throughput in records/second: (\\S+)"
+            }
+        ]
+        super(KMeans, self).__init__(role, train_instance_count, train_instance_type,
+                                     metric_definitions=metric_definitions, **kwargs)
         self.k = k
         self.init_method = init_method
         self.max_iterations = max_iterations
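Each 'Regex' here is applied to the algorithm's training log output, and the first capture group becomes the reported metric value. A minimal sketch of what that extraction amounts to, using a made-up log line in the format the 'train:throughput' pattern expects:

import re

# Hypothetical log line; only the numeric capture group matters.
log_line = "#throughput_metric: train throughput in records/second: 10543.2"
pattern = "#throughput_metric: train throughput in records/second: (\\S+)"

match = re.search(pattern, log_line)
if match:
    print(match.group(1))  # -> 10543.2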

src/sagemaker/estimator.py

Lines changed: 4 additions & 1 deletion
@@ -43,7 +43,8 @@ class EstimatorBase(with_metaclass(ABCMeta, object)):

     def __init__(self, role, train_instance_count, train_instance_type,
                  train_volume_size=30, train_max_run=24 * 60 * 60, input_mode='File',
-                 output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None):
+                 output_path=None, output_kms_key=None, base_job_name=None, sagemaker_session=None,
+                 metric_definitions=None):
         """Initialize an ``EstimatorBase`` instance.

         Args:
@@ -72,6 +73,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
             sagemaker_session (sagemaker.session.Session): Session object which manages interactions with
                 Amazon SageMaker APIs and any other AWS services needed. If not specified, the estimator creates one
                 using the default AWS configuration chain.
+            metric_definitions (list[dict]): Metric definitions, each a dictionary with 'Name' and 'Regex' keys.
         """
         self.role = role
         self.train_instance_count = train_instance_count
@@ -95,6 +97,7 @@ def __init__(self, role, train_instance_count, train_instance_type,
         self.output_path = output_path
         self.output_kms_key = output_kms_key
         self.latest_training_job = None
+        self.metric_definitions = metric_definitions

     @abstractmethod
     def train_image(self):
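Because EstimatorBase is abstract, the new keyword is only reachable through subclasses. A hedged sketch of how a subclass that forwards **kwargs could let callers supply their own definitions (MyEstimator and the regex are placeholders, not part of this commit; note that KMeans above currently hard-codes its own list):

# Sketch only: MyEstimator stands in for any EstimatorBase subclass that
# forwards **kwargs to EstimatorBase.__init__.
metric_definitions = [
    {"Name": "validation:accuracy", "Regex": "validation accuracy=(\\S+)"}  # example metric
]

estimator = MyEstimator(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        metric_definitions=metric_definitions)
print(estimator.metric_definitions)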

src/sagemaker/hpo.py

Lines changed: 80 additions & 2 deletions
@@ -11,6 +11,9 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.

+from sagemaker.estimator import _TrainingJob
+from sagemaker.utils import base_name_from_image, name_from_base
+

 class _HpoParameter(object):
     __all_types__ = ['Continuous', 'Categorical', 'Integer']
@@ -21,7 +24,6 @@ def __init__(self, min_value, max_value):

     def as_hpo_range(self, name):
         return {'Name': name,
-                'Type': self.__name__,
                 'MinValue': str(self.min_value),
                 'MaxValue': str(self.max_value)}

@@ -44,7 +46,6 @@ def __init__(self, values):

     def as_hpo_range(self, name):
         return {'Name': name,
-                'Type': self.__name__,
                 'Values': self.values}


@@ -53,3 +54,80 @@ class IntegerParameter(_HpoParameter):

     def __init__(self, min_value, max_value):
         super(IntegerParameter, self).__init__(min_value, max_value)
+
+
+class HyperparameterTuner(object):
+    __objectives__ = ['Minimize', 'Maximize']
+
+    def __init__(self, objective='Maximize', max_jobs=1, max_parallel_jobs=1):
+        if objective not in HyperparameterTuner.__objectives__:
+            raise ValueError("Unsupported 'objective' value")
+        self.strategy = 'Bayesian'
+        self.objective = objective
+        self.max_jobs = max_jobs
+        self.max_parallel_jobs = max_parallel_jobs
+
+    def tune(self, estimator, inputs, metric_name):  # TODO: accept an explicit 'hyperparameters' argument
+        # self.optimize_hp = hyperparameters
+        self.optimize_metric_name = metric_name
+        self.estimator = estimator
+
+        self.latest_tuning_job = _TuningJob.start_new(self, inputs)
+
+
+class _TuningJob(_TrainingJob):
+    def __init__(self, sagemaker_session, tuning_job_name):
+        self.sagemaker_session = sagemaker_session
+        self.tuning_job_name = tuning_job_name
+
+    @classmethod
+    def start_new(cls, tuner, inputs):
+        """Create a new Amazon SageMaker HPO tuning job from the HyperparameterTuner.
+
+        Args:
+            tuner (sagemaker.hpo.HyperparameterTuner): Tuner object created by the user.
+            inputs (str): Parameters used when calling :meth:`~sagemaker.estimator.EstimatorBase.fit`.
+
+        Returns:
+            sagemaker.hpo._TuningJob: Constructed object that captures all information about the started job.
+        """
+        input_config = _TrainingJob._format_inputs_to_input_config(inputs)
+        role = tuner.estimator.sagemaker_session.expand_role(tuner.estimator.role)
+        output_config = _TrainingJob._prepare_output_config(tuner.estimator.output_path,
+                                                            tuner.estimator.output_kms_key)
+        resource_config = _TrainingJob._prepare_resource_config(tuner.estimator.train_instance_count,
+                                                                tuner.estimator.train_instance_type,
+                                                                tuner.estimator.train_volume_size)
+        stop_condition = _TrainingJob._prepare_stopping_condition(tuner.estimator.train_max_run)
+
+        if tuner.estimator.hyperparameters() is None:
+            raise ValueError('Cannot tune estimator without hyperparameters')
+
+        # TODO: the current code path only works for first-party (1P) algorithms; for others, split the
+        # hyperparameters defined on the estimator into static and HPO-controlled parts, something like:
+        # static_hp = {str(k): str(v) for (k, v) in tuner.estimator.hyperparameters().items()}
+        # for hp_name in tuner.optimize_hp.keys():
+        #     del static_hp[hp_name]

+        # make sure the job name is unique for each invocation; honor a supplied base_job_name or generate one
+        # TODO: should the tuner have separate logic/code for the base name?
+        base_name = tuner.estimator.base_job_name or base_name_from_image(tuner.estimator.train_image())
+        hpo_job_name = name_from_base(base_name)
+
+        tuner.estimator.sagemaker_session.tune(job_name=hpo_job_name, strategy=tuner.strategy,
+                                               objective=tuner.objective, metric_name=tuner.optimize_metric_name,
+                                               max_jobs=tuner.max_jobs, max_parallel_jobs=tuner.max_parallel_jobs,
+                                               parameter_ranges=tuner.estimator.hpo_hyperparameters(),
+                                               static_hp=tuner.estimator.hyperparameters(),
+                                               image=tuner.estimator.train_image(),
+                                               input_mode=tuner.estimator.input_mode,
+                                               metric_definitions=tuner.estimator.metric_definitions,
+                                               role=role, input_config=input_config,
+                                               output_config=output_config, resource_config=resource_config,
+                                               stop_condition=stop_condition)
+
+        return cls(tuner.estimator.sagemaker_session, hpo_job_name)
+
+    @property
+    def name(self):
+        return self.tuning_job_name
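Put together, the intended call pattern is small: declare ranges on a configured estimator, build a tuner, and name the objective metric. A minimal sketch, assuming `estimator` and the `data` channel dict are already set up as in the integration test below:

from sagemaker.hpo import HyperparameterTuner, IntegerParameter

# Assumes `estimator` is a configured first-party estimator (e.g. KMeans) and
# `data` maps the channel name to an s3_input, as in tests/integ/test_hpo.py.
estimator.mini_batch_size = IntegerParameter(10, 100)  # range to search over

tuner = HyperparameterTuner(objective='Maximize', max_jobs=8, max_parallel_jobs=2)
tuner.tune(estimator, data, metric_name='train:throughput')

print(tuner.latest_tuning_job.name)  # unique name generated via name_from_base()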

src/sagemaker/session.py

Lines changed: 99 additions & 0 deletions
@@ -243,6 +243,82 @@ def train(self, image, input_mode, input_config, role, job_name, output_config,
         LOGGER.debug('train request: {}'.format(json.dumps(train_request, indent=4)))
         self.sagemaker_client.create_training_job(**train_request)

+    def tune(self, job_name, strategy, objective, metric_name,
+             max_jobs, max_parallel_jobs, parameter_ranges,
+             static_hp, image, input_mode, metric_definitions,
+             role, input_config, output_config, resource_config, stop_condition):
+        """Create an Amazon SageMaker HPO job.
+
+        Args:
+            job_name (str): Name of the tuning job being created.
+            strategy (str): Strategy to be used for the tuning job, e.g. 'Bayesian'.
+            objective (str): Objective type, either 'Minimize' or 'Maximize'.
+            metric_name (str): Name of the metric to use when evaluating training jobs.
+            max_jobs (int): Maximum total number of training jobs to start.
+            max_parallel_jobs (int): Maximum number of parallel training jobs to start.
+            parameter_ranges (dict): Parameter ranges keyed by type: Continuous, Integer and Categorical.
+            static_hp (dict): Hyperparameters for model training. The hyperparameters are made accessible as
+                a dict[str, str] to the training code on SageMaker. For convenience, this accepts other types for
+                keys and values, but ``str()`` will be called to convert them before training.
+            image (str): Docker image containing training code.
+            input_mode (str): The input mode that the algorithm supports. Valid modes:
+
+                * 'File' - Amazon SageMaker copies the training dataset from the S3 location to
+                    a directory in the Docker container.
+                * 'Pipe' - Amazon SageMaker streams data directly from S3 to the container via a Unix-named pipe.
+
+            metric_definitions (list[dict]): Metric definitions emitted by the training jobs, each with
+                'Name' and 'Regex' keys.
+            role (str): An AWS IAM role (either name or full ARN). The Amazon SageMaker training jobs and APIs
+                that create Amazon SageMaker endpoints use this role to access training data and model artifacts.
+                You must grant sufficient permissions to this role.
+            input_config (list): A list of Channel objects. Each channel is a named input source. Please refer to
+                the format details described:
+                https://botocore.readthedocs.io/en/latest/reference/services/sagemaker.html#SageMaker.Client.create_training_job
+            output_config (dict): The S3 URI where you want to store the training results and optional KMS key ID.
+            resource_config (dict): Contains values for ResourceConfig:
+                instance_count (int): Number of EC2 instances to use for training.
+                instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
+            stop_condition (dict): Defines when training shall finish. Contains entries that can be understood by the
+                service like ``MaxRuntimeInSeconds``.
+        """
+
+        tune_request = {
+            'HyperParameterTuningJobName': job_name,
+            'HyperParameterTuningJobConfig': {
+                'Strategy': strategy,
+                'HyperParameterTuningJobObjective': {
+                    'Type': objective,
+                    'MetricName': metric_name,
+                },
+                'ResourceLimits': {
+                    'MaxNumberOfTrainingJobs': max_jobs,
+                    'MaxParallelTrainingJobs': max_parallel_jobs
+                },
+                'ParameterRanges': parameter_ranges
+            },
+            'TrainingJobDefinition': {
+                'StaticHyperParameters': static_hp,
+                'AlgorithmSpecification': {
+                    'TrainingImage': image,
+                    'TrainingInputMode': input_mode
+                },
+                'RoleArn': role,
+                'InputDataConfig': input_config,
+                'OutputDataConfig': output_config,
+                'ResourceConfig': resource_config,
+                'StoppingCondition': stop_condition,
+            }
+        }
+
+        if metric_definitions is not None:
+            tune_request['TrainingJobDefinition']['AlgorithmSpecification']['MetricDefinitions'] = metric_definitions
+
+        LOGGER.info('Creating tuning-job with name: {}'.format(job_name))
+        LOGGER.debug('tune request: {}'.format(json.dumps(tune_request, indent=4)))
+        self.sagemaker_client.create_hyper_parameter_tuning_job(**tune_request)
+
     def create_model(self, name, role, primary_container):
         """Create an Amazon SageMaker ``Model``.

@@ -800,6 +876,29 @@ def _train_done(sagemaker_client, job_name):
     return desc


+def _tune_done(sagemaker_client, job_name):
+    tuning_status_codes = {
+        'Completed': '!',
+        'InProgress': '.',
+        'Failed': '*',
+        'Stopped': 's',
+        'Stopping': '_'
+    }
+    in_progress_statuses = ['InProgress', 'Stopping']
+
+    desc = sagemaker_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=job_name)
+    status = desc['HyperParameterTuningJobStatus']
+
+    print(tuning_status_codes.get(status, '?'), end='')
+    sys.stdout.flush()
+
+    if status in in_progress_statuses:
+        return None
+
+    print('')
+    return desc
+
+
 def _deploy_done(sagemaker_client, endpoint_name):
     hosting_status_codes = {
         "OutOfService": "x",

tests/integ/test_hpo.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+import gzip
+import os
+import pickle
+import sys
+
+from sagemaker.amazon.kmeans import KMeans
+from sagemaker.hpo import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
+from sagemaker.session import s3_input
+
+from tests.integ import DATA_DIR
+
+
+def test_hpo(sagemaker_session):
+    tuner = HyperparameterTuner(objective='Minimize', max_jobs=8, max_parallel_jobs=2)
+
+    data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
+    pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
+
+    # Load the data into memory as numpy arrays
+    with gzip.open(data_path, 'rb') as f:
+        train_set, _, _ = pickle.load(f, **pickle_args)
+
+    kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
+                    train_instance_type='ml.c4.xlarge',
+                    k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
+                    output_path='s3://{}/'.format(sagemaker_session.default_bucket()))
+
+    # set KMeans-specific hyperparameters
+    kmeans.init_method = 'random'
+    kmeans.max_iterations = 1
+    kmeans.tol = 1
+    kmeans.num_trials = 1
+    kmeans.local_init_method = 'kmeans++'
+    kmeans.half_life_time_size = 1
+    kmeans.epochs = 1
+
+    records = kmeans.record_set(train_set[0][:100])
+
+    # TODO: this is done during fit(); needs to be refactored
+    kmeans.mini_batch_size = 5000
+    kmeans.feature_dim = records.feature_dim
+
+    # specify which hyperparameters to optimize over
+    kmeans.center_factor = IntegerParameter(1, 10)
+    kmeans.mini_batch_size = IntegerParameter(10, 100)
+    kmeans.tol = ContinuousParameter(1.0, 2.0)
+    kmeans.local_init_method = CategoricalParameter(['kmeans++', 'random'])
+
+    data = {records.channel: s3_input(records.s3_data, distribution='ShardedByS3Key',
+                                      s3_data_type=records.s3_data_type)}
+    tuner.tune(kmeans, data, 'train:throughput')
+
+    print('Started HPO job with name: ' + tuner.latest_tuning_job.name)
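The test only starts the tuning job and prints its name; it never waits on it. A hedged follow-up for checking on the job afterwards, using the standard DescribeHyperParameterTuningJob API through the session's low-level client (the check itself is not part of the test):

# Optional verification (not part of the test): describe the job that was started.
desc = sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.name)
print(desc['HyperParameterTuningJobStatus'])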
