Fix multiple channel (aws#45)

ChoiByungWook · web-flow · commit f9838b6eca40 · 2018-05-30T21:35:10.000-07:00
diff --git a/src/sagemaker/amazon/amazon_estimator.py b/src/sagemaker/amazon/amazon_estimator.py
@@ -19,7 +19,7 @@
 from sagemaker.amazon import validation
 from sagemaker.amazon.hyperparameter import Hyperparameter as hp  # noqa
 from sagemaker.amazon.common import write_numpy_to_dense_tensor
-from sagemaker.estimator import EstimatorBase
+from sagemaker.estimator import EstimatorBase, _TrainingJob
 from sagemaker.session import s3_input
 from sagemaker.utils import sagemaker_timestamp
 
@@ -104,10 +104,22 @@ def prepare_for_training(self, records, mini_batch_size=None, job_name=None):
         """
         super(AmazonAlgorithmEstimatorBase, self).prepare_for_training(job_name=job_name)
 
-        self.feature_dim = records.feature_dim
+        feature_dim = None
+
+        if isinstance(records, list):
+            for record in records:
+                if record.channel == 'train':
+                    feature_dim = record.feature_dim
+                    break
+            if feature_dim is None:
+                raise ValueError('Must provide train channel.')
+        else:
+            feature_dim = records.feature_dim
+
+        self.feature_dim = feature_dim
         self.mini_batch_size = mini_batch_size
 
-    def fit(self, records, mini_batch_size=None, **kwargs):
+    def fit(self, records, mini_batch_size=None, wait=True, logs=True, job_name=None):
         """Fit this Estimator on serialized Record objects, stored in S3.
 
         ``records`` should be an instance of :class:`~RecordSet`. This defines a collection of
@@ -127,9 +139,17 @@ def fit(self, records, mini_batch_size=None, **kwargs):
             records (:class:`~RecordSet`): The records to train this ``Estimator`` on
             mini_batch_size (int or None): The size of each mini-batch to use when training. If ``None``, a
                 default value will be used.
+            wait (bool): Whether the call should wait until the job completes (default: True).
+            logs (bool): Whether to show the logs produced by the job.
+                Only meaningful when wait is True (default: True).
+            job_name (str): Training job name. If not specified, the estimator generates a default job name,
+                based on the training image name and current timestamp.
         """
-        super(AmazonAlgorithmEstimatorBase, self).fit(records.data_channel(), records=records,
-                                                      mini_batch_size=mini_batch_size, **kwargs)
+        self.prepare_for_training(records, job_name=job_name, mini_batch_size=mini_batch_size)
+
+        self.latest_training_job = _TrainingJob.start_new(self, records)
+        if wait:
+            self.latest_training_job.wait(logs=logs)
 
     def record_set(self, train, labels=None, channel="train"):
         """Build a :class:`~RecordSet` from a numpy :class:`~ndarray` matrix and label vector.
@@ -193,8 +213,11 @@ def __repr__(self):
 
     def data_channel(self):
         """Return a dictionary to represent the training data in a channel for use with ``fit()``"""
-        return {self.channel: s3_input(self.s3_data, distribution='ShardedByS3Key',
-                                       s3_data_type=self.s3_data_type)}
+        return {self.channel: self.records_s3_input()}
+
+    def records_s3_input(self):
+        """Return an ``s3_input`` for this record set's S3 data, using ``ShardedByS3Key`` distribution."""
+        return s3_input(self.s3_data, distribution='ShardedByS3Key', s3_data_type=self.s3_data_type)
 
 
 def _build_shards(num_shards, array):
diff --git a/src/sagemaker/amazon/pca.py b/src/sagemaker/amazon/pca.py
@@ -102,9 +102,20 @@ def prepare_for_training(self, records, mini_batch_size=None, job_name=None):
             * job_name (str): Name of the training job to be created. If not specified, one is generated,
                 using the base name given to the constructor if applicable.
         """
+        num_records = None
+        if isinstance(records, list):
+            for record in records:
+                if record.channel == 'train':
+                    num_records = record.num_records
+                    break
+            if num_records is None:
+                raise ValueError('Must provide train channel.')
+        else:
+            num_records = records.num_records
+
         # mini_batch_size is a required parameter
         default_mini_batch_size = min(self.DEFAULT_MINI_BATCH_SIZE,
-                                      max(1, int(records.num_records / self.train_instance_count)))
+                                      max(1, int(num_records / self.train_instance_count)))
         use_mini_batch_size = mini_batch_size or default_mini_batch_size
 
         super(PCA, self).prepare_for_training(records=records, mini_batch_size=use_mini_batch_size, job_name=job_name)
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -144,7 +144,7 @@ def prepare_for_training(self, job_name=None):
             else:
                 self.output_path = 's3://{}/'.format(self.sagemaker_session.default_bucket())
 
-    def fit(self, inputs, wait=True, logs=True, job_name=None, **kwargs):
+    def fit(self, inputs, wait=True, logs=True, job_name=None):
         """Train a model using the input training dataset.
 
         The API calls the Amazon SageMaker CreateTrainingJob API to start model training.
@@ -172,7 +172,7 @@ def fit(self, inputs, wait=True, logs=True, job_name=None, **kwargs):
             job_name (str): Training job name. If not specified, the estimator generates a default job name,
                 based on the training image name and current timestamp.
         """
-        self.prepare_for_training(job_name=job_name, **kwargs)
+        self.prepare_for_training(job_name=job_name)
 
         self.latest_training_job = _TrainingJob.start_new(self, inputs)
         if wait:
diff --git a/src/sagemaker/job.py b/src/sagemaker/job.py
@@ -68,6 +68,11 @@ def _load_config(inputs, estimator):
 
     @staticmethod
     def _format_inputs_to_input_config(inputs):
+        # Circular dependency
+        from sagemaker.amazon.amazon_estimator import RecordSet
+        if isinstance(inputs, RecordSet):
+            inputs = inputs.data_channel()
+
         input_dict = {}
         if isinstance(inputs, string_types):
             input_dict['training'] = _Job._format_string_uri_input(inputs)
@@ -78,6 +83,15 @@ def _format_inputs_to_input_config(inputs):
         elif isinstance(inputs, dict):
             for k, v in inputs.items():
                 input_dict[k] = _Job._format_string_uri_input(v)
+        elif isinstance(inputs, list):
+            for record in inputs:
+                if not isinstance(record, RecordSet):
+                    raise ValueError('List compatible only with RecordSets.')
+
+                if record.channel in input_dict:
+                    raise ValueError('Duplicate channels not allowed.')
+
+                input_dict[record.channel] = record.records_s3_input()
         else:
             raise ValueError(
                 'Cannot format input {}. Expecting one of str, dict or s3_input'.format(inputs))
diff --git a/src/sagemaker/tuner.py b/src/sagemaker/tuner.py
@@ -133,15 +133,14 @@ def fit(self, inputs, job_name=None, **kwargs):
 
         Args:
             inputs (str): Parameters used when called :meth:`~sagemaker.estimator.EstimatorBase.fit`.
-            job_name (str): Job name
+            job_name (str): Tuning job name. If not specified, the tuner generates a default job name,
+                based on the training image name and current timestamp.
             **kwargs: Other arguments
         """
-        # 1P estimators require a RecordSet object
-        if isinstance(inputs, RecordSet):
+        if isinstance(inputs, list) or isinstance(inputs, RecordSet):
             self.estimator.prepare_for_training(inputs, **kwargs)
-            inputs = inputs.data_channel()
         else:
-            self.estimator.prepare_for_training(**kwargs)
+            self.estimator.prepare_for_training(job_name)
 
         self.prepare_for_training(job_name=job_name)
         self.latest_tuning_job = _TuningJob.start_new(self, inputs)
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
@@ -46,7 +46,8 @@ def test_fit_1p(sagemaker_session):
     kmeans.half_life_time_size = 1
     kmeans.epochs = 1
 
-    records = kmeans.record_set(train_set[0][:100], channel='test')
+    records = kmeans.record_set(train_set[0][:100])
+    test_records = kmeans.record_set(train_set[0][:100], channel='test')
 
     # specify which hp you want to optimize over
     hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
@@ -59,7 +60,7 @@ def test_fit_1p(sagemaker_session):
                                 hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                 max_parallel_jobs=2)
 
-    tuner.fit(records)
+    tuner.fit([records, test_records])
 
     print('Started HPO job with name:' + tuner.latest_tuning_job.name)
 
diff --git a/tests/unit/test_amazon_estimator.py b/tests/unit/test_amazon_estimator.py
@@ -110,6 +110,31 @@ def test_prepare_for_training():
     assert pca.mini_batch_size == 1
 
 
+def test_prepare_for_training_list():
+    """A one-element list of RecordSets on the default channel sets feature_dim and mini_batch_size."""
+    pca = PCA(num_components=55, **COMMON_ARGS)
+    train = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 8.0], [44.0, 55.0, 66.0]]
+    labels = [99, 85, 87, 2]
+    records = [pca.record_set(np.array(train), np.array(labels))]
+
+    pca.prepare_for_training(records, mini_batch_size=1)
+    assert pca.feature_dim == 3
+    assert pca.mini_batch_size == 1
+
+
+def test_prepare_for_training_list_no_train_channel():
+    """A RecordSet list holding only a 'test' channel is rejected with a ValueError."""
+    pca = PCA(num_components=55, **COMMON_ARGS)
+    train = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 8.0], [44.0, 55.0, 66.0]]
+    labels = [99, 85, 87, 2]
+    records = [pca.record_set(np.array(train), np.array(labels), 'test')]
+
+    with pytest.raises(ValueError) as ex:
+        pca.prepare_for_training(records, mini_batch_size=1)
+
+    assert 'Must provide train channel.' in str(ex.value)
+
+
 @patch('time.strftime', return_value=TIMESTAMP)
 def test_fit_ndarray(time, sagemaker_session):
     mock_s3 = Mock()
diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py
@@ -15,6 +15,7 @@
 import pytest
 from mock import Mock
 
+from sagemaker.amazon.amazon_estimator import RecordSet
 from sagemaker.estimator import Estimator
 from sagemaker.job import _Job
 from sagemaker.session import s3_input
@@ -86,6 +87,45 @@ def test_format_inputs_to_input_config_dict():
     assert channels[0]['DataSource']['S3DataSource']['S3Uri'] == inputs['train']
 
 
+def test_format_inputs_to_input_config_record_set():
+    """A bare RecordSet is formatted into a channel carrying its S3 URI and data type."""
+    inputs = RecordSet(s3_data=BUCKET_NAME, num_records=1, feature_dim=1)
+    channels = _Job._format_inputs_to_input_config(inputs)
+
+    assert channels[0]['DataSource']['S3DataSource']['S3Uri'] == inputs.s3_data
+    assert channels[0]['DataSource']['S3DataSource']['S3DataType'] == inputs.s3_data_type
+
+
+def test_format_inputs_to_input_config_list():
+    """A list holding one RecordSet is formatted into a channel with that set's S3 URI and data type."""
+    records = RecordSet(s3_data=BUCKET_NAME, num_records=1, feature_dim=1)
+    inputs = [records]
+    channels = _Job._format_inputs_to_input_config(inputs)
+
+    assert channels[0]['DataSource']['S3DataSource']['S3Uri'] == records.s3_data
+    assert channels[0]['DataSource']['S3DataSource']['S3DataType'] == records.s3_data_type
+
+
+def test_format_inputs_to_input_config_list_not_all_records():
+    """A list mixing a RecordSet with a non-RecordSet item is rejected with a ValueError."""
+    records = RecordSet(s3_data=BUCKET_NAME, num_records=1, feature_dim=1)
+    inputs = [records, 'mock']
+    with pytest.raises(ValueError) as ex:
+        _Job._format_inputs_to_input_config(inputs)
+
+    assert 'List compatible only with RecordSets.' in str(ex.value)
+
+
+def test_format_inputs_to_input_config_list_duplicate_channel():
+    """A list with two RecordSets on the same channel is rejected with a ValueError."""
+    record = RecordSet(s3_data=BUCKET_NAME, num_records=1, feature_dim=1)
+    inputs = [record, record]
+    with pytest.raises(ValueError) as ex:
+        _Job._format_inputs_to_input_config(inputs)
+
+    assert 'Duplicate channels not allowed.' in str(ex.value)
+
+
 def test_format_input_single_unamed_channel():
     input_dict = _Job._format_inputs_to_input_config('s3://blah/blah')
     assert input_dict == [{
diff --git a/tests/unit/test_tuner.py b/tests/unit/test_tuner.py
@@ -207,14 +207,15 @@ def test_fit_1p(sagemaker_session, tuner):
     tuner._hyperparameter_ranges = hyperparameter_ranges
 
     records = RecordSet(s3_data=INPUTS, num_records=1, feature_dim=1)
-    tuner.fit(records)
+    tuner.fit(records, mini_batch_size=9999)
 
     _, _, tune_kwargs = sagemaker_session.tune.mock_calls[0]
 
     assert len(tune_kwargs['static_hyperparameters']) == 4
     assert tune_kwargs['static_hyperparameters']['extra_components'] == '5'
     assert len(tune_kwargs['parameter_ranges']['IntegerParameterRanges']) == 1
     assert tune_kwargs['job_name'].startswith('pca')
+    assert tuner.estimator.mini_batch_size == 9999
 
 
 def test_attach_tuning_job_with_estimator_from_hyperparameters(sagemaker_session):