Refactor EstimatorBase and Framework to have a prepare_for_training() method (aws#15)

laurenyu · web-flow · commit 453b6a835d18 · 2018-05-21T09:23:43.000-07:00
* Refactor EstimatorBase and Framework to have a prepare_for_training() method

* Specify argument directly instead of using **kwargs
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -120,6 +120,29 @@ def hyperparameters(self):
         """
         pass
 
+    def prepare_for_training(self, job_name=None):
+        """Set any values in the estimator that need to be set before training.
+
+        Args:
+            job_name (str): Name of the training job to be created. If not specified, one is generated
+                from ``base_job_name`` if it is set, or otherwise from the training image name.
+        """
+        if job_name is not None:
+            self._current_job_name = job_name
+        else:
+            # honor the supplied base_job_name, or derive a base name from the training image
+            base_name = self.base_job_name or base_name_from_image(self.train_image())
+            self._current_job_name = name_from_base(base_name)
+
+        # Use output_path if it was specified; otherwise default to the session's default S3 bucket.
+        # In Local Mode with local_code=True no explicit output_path is needed, so it is set to ''.
+        if self.output_path is None:
+            local_code = get_config_value('local.local_code', self.sagemaker_session.config)
+            if self.sagemaker_session.local_mode and local_code:
+                self.output_path = ''
+            else:
+                self.output_path = 's3://{}/'.format(self.sagemaker_session.default_bucket())
+
     def fit(self, inputs, wait=True, logs=True, job_name=None):
         """Train a model using the input training dataset.
 
@@ -148,22 +171,7 @@ def fit(self, inputs, wait=True, logs=True, job_name=None):
             job_name (str): Training job name. If not specified, the estimator generates a default job name,
                 based on the training image name and current timestamp.
         """
-
-        if job_name is not None:
-            self._current_job_name = job_name
-        else:
-            # make sure the job name is unique for each invocation, honor supplied base_job_name or generate it
-            base_name = self.base_job_name or base_name_from_image(self.train_image())
-            self._current_job_name = name_from_base(base_name)
-
-        # if output_path was specified we use it otherwise initialize here.
-        # For Local Mode with local_code=True we don't need an explicit output_path
-        if self.output_path is None:
-            local_code = get_config_value('local.local_code', self.sagemaker_session.config)
-            if self.sagemaker_session.local_mode and local_code:
-                self.output_path = ''
-            else:
-                self.output_path = 's3://{}/'.format(self.sagemaker_session.default_bucket())
+        self.prepare_for_training(job_name=job_name)
 
         self.latest_training_job = _TrainingJob.start_new(self, inputs)
         if wait:
@@ -505,39 +513,14 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl
         self._hyperparameters = hyperparameters or {}
         self.code_location = code_location
 
-    def fit(self, inputs, wait=True, logs=True, job_name=None):
-        """Train a model using the input training dataset.
-
-        The API calls the Amazon SageMaker CreateTrainingJob API to start model training.
-        The API uses configuration you provided to create the estimator and the
-        specified input training data to send the CreatingTrainingJob request to Amazon SageMaker.
-
-        This is a synchronous operation. After the model training successfully completes,
-        you can call the ``deploy()`` method to host the model using the Amazon SageMaker hosting services.
+    def prepare_for_training(self, job_name=None):
+        """Set hyperparameters needed for training. This method will also validate ``source_dir``.
 
         Args:
-            inputs (str or dict or sagemaker.session.s3_input): Information about the training data.
-                This can be one of three types:
-                (str) - the S3 location where training data is saved.
-                (dict[str, str] or dict[str, sagemaker.session.s3_input]) - If using multiple channels for
-                    training data, you can specify a dict mapping channel names
-                    to strings or :func:`~sagemaker.session.s3_input` objects.
-                (sagemaker.session.s3_input) - channel configuration for S3 data sources that can provide
-                    additional information about the training dataset. See :func:`sagemaker.session.s3_input`
-                    for full details.
-            wait (bool): Whether the call shouldl wait until the job completes (default: True).
-            logs (bool): Whether to show the logs produced by the job.
-                Only meaningful when wait is True (default: True).
-            job_name (str): Training job name. If not specified, the estimator generates a default job name,
-                based on the training image name and current timestamp.
+            job_name (str): Name of the training job to be created. If not specified, one is generated,
+                using the base name given to the constructor if applicable.
         """
-        # always determine new job name _here_ because it is used before base is called
-        if job_name is not None:
-            self._current_job_name = job_name
-        else:
-            # honor supplied base_job_name or generate it
-            base_name = self.base_job_name or base_name_from_image(self.train_image())
-            self._current_job_name = name_from_base(base_name)
+        super(Framework, self).prepare_for_training(job_name=job_name)
 
         # validate source dir will raise a ValueError if there is something wrong with the
         # source directory. We are intentionally not handling it because this is a critical error.
@@ -567,7 +550,6 @@ def fit(self, inputs, wait=True, logs=True, job_name=None):
         self._hyperparameters[CONTAINER_LOG_LEVEL_PARAM_NAME] = self.container_log_level
         self._hyperparameters[JOB_NAME_PARAM_NAME] = self._current_job_name
         self._hyperparameters[SAGEMAKER_REGION_PARAM_NAME] = self.sagemaker_session.boto_region_name
-        super(Framework, self).fit(inputs, wait, logs, self._current_job_name)
 
     def _stage_user_code_in_s3(self):
         """ Upload the user training script to s3 and return the location.
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -15,6 +15,7 @@
 import logging
 import json
 import os
+
 import pytest
 from mock import Mock, patch
 
@@ -39,20 +40,22 @@
 REGION = 'us-west-2'
 JOB_NAME = '{}-{}'.format(IMAGE_NAME, TIMESTAMP)
 
-COMMON_TRAIN_ARGS = {'volume_size': 30,
-                     'hyperparameters': {
-                         'sagemaker_program': 'dummy_script.py',
-                         'sagemaker_enable_cloudwatch_metrics': False,
-                         'sagemaker_container_log_level': logging.INFO,
-                     },
-                     'input_mode': 'File',
-                     'instance_type': 'c4.4xlarge',
-                     'inputs': 's3://mybucket/train',
-                     'instance_count': 1,
-                     'role': 'DummyRole',
-                     'kms_key_id': None,
-                     'max_run': 24,
-                     'wait': True}
+COMMON_TRAIN_ARGS = {
+    'volume_size': 30,
+    'hyperparameters': {
+        'sagemaker_program': 'dummy_script.py',
+        'sagemaker_enable_cloudwatch_metrics': False,
+        'sagemaker_container_log_level': logging.INFO,
+    },
+    'input_mode': 'File',
+    'instance_type': 'c4.4xlarge',
+    'inputs': 's3://mybucket/train',
+    'instance_count': 1,
+    'role': 'DummyRole',
+    'kms_key_id': None,
+    'max_run': 24,
+    'wait': True,
+}
 
 DESCRIBE_TRAINING_JOB_RESULT = {
     'ModelArtifacts': {
@@ -275,19 +278,6 @@ def test_attach_framework(sagemaker_session):
     assert framework_estimator.entry_point == 'iris-dnn-classifier.py'
 
 
-def test_fit_then_fit_again(sagemaker_session):
-    fw = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
-                        train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
-                        enable_cloudwatch_metrics=True)
-    fw.fit(inputs=s3_input('s3://mybucket/train'))
-    first_job_name = fw.latest_training_job.name
-
-    fw.fit(inputs=s3_input('s3://mybucket/train2'))
-    second_job_name = fw.latest_training_job.name
-
-    assert first_job_name != second_job_name
-
-
 @patch('time.strftime', return_value=TIMESTAMP)
 def test_fit_verify_job_name(strftime, sagemaker_session):
     fw = DummyFramework(entry_point=SCRIPT_PATH, role='DummyRole', sagemaker_session=sagemaker_session,
@@ -304,42 +294,55 @@ def test_fit_verify_job_name(strftime, sagemaker_session):
     assert fw.latest_training_job.name == JOB_NAME
 
 
-def test_fit_force_name(sagemaker_session):
+def test_prepare_for_training_unique_job_name_generation(sagemaker_session):
+    fw = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
+                        train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
+                        enable_cloudwatch_metrics=True)
+    fw.prepare_for_training()
+    first_job_name = fw._current_job_name
+
+    fw.prepare_for_training()
+    second_job_name = fw._current_job_name
+
+    assert first_job_name != second_job_name
+
+
+def test_prepare_for_training_force_name(sagemaker_session):
     fw = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                         train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                         base_job_name='some', enable_cloudwatch_metrics=True)
-    fw.fit(inputs=s3_input('s3://mybucket/train'), job_name='use_it')
-    assert 'use_it' == fw.latest_training_job.name
+    fw.prepare_for_training(job_name='use_it')
+    assert 'use_it' == fw._current_job_name
 
 
 @patch('time.strftime', return_value=TIMESTAMP)
-def test_fit_force_generation(strftime, sagemaker_session):
+def test_prepare_for_training_force_name_generation(strftime, sagemaker_session):
     fw = DummyFramework(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                         train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                         base_job_name='some', enable_cloudwatch_metrics=True)
     fw.base_job_name = None
-    fw.fit(inputs=s3_input('s3://mybucket/train'))
-    assert JOB_NAME == fw.latest_training_job.name
+    fw.prepare_for_training()
+    assert JOB_NAME == fw._current_job_name
 
 
 @patch('time.strftime', return_value=TIMESTAMP)
 def test_init_with_source_dir_s3(strftime, sagemaker_session):
-    uri = 'bucket/mydata'
-
     fw = DummyFramework(entry_point=SCRIPT_PATH, source_dir='s3://location', role=ROLE,
                         sagemaker_session=sagemaker_session,
                         train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                         enable_cloudwatch_metrics=False)
-    fw.fit('s3://{}'.format(uri))
-
-    expected_hyperparameters = BASE_HP.copy()
-    expected_hyperparameters['sagemaker_enable_cloudwatch_metrics'] = 'false'
-    expected_hyperparameters['sagemaker_container_log_level'] = str(logging.INFO)
-    expected_hyperparameters['sagemaker_submit_directory'] = json.dumps("s3://location")
-    expected_hyperparameters['sagemaker_region'] = '"us-west-2"'
-
-    actual_hyperparameter = sagemaker_session.method_calls[1][2]['hyperparameters']
-    assert actual_hyperparameter == expected_hyperparameters
+    fw.prepare_for_training()
+
+    expected_hyperparameters = {
+        'sagemaker_program': SCRIPT_NAME,
+        'sagemaker_submit_directory': 's3://mybucket/{}/source/sourcedir.tar.gz'.format(JOB_NAME),
+        'sagemaker_job_name': JOB_NAME,
+        'sagemaker_enable_cloudwatch_metrics': False,
+        'sagemaker_container_log_level': logging.INFO,
+        'sagemaker_submit_directory': 's3://location',
+        'sagemaker_region': 'us-west-2',
+    }
+    assert fw._hyperparameters == expected_hyperparameters
 
 
 # _TrainingJob 'utils'