
Commit 0176605

Support for spot training. (aws#303)
* Support for spot training.
* Updated the comment
* Fixing the CI build for pre-commit failures.
* Fixing the CI.
* Adding the test file for testing spot training
* Support for spot training. Addressed the review comments. Added the test script.
* Added the log statement as per review comment.
* Fixing the test to run correctly in CI
* Emit the end of training file only if the job is not running under SageMaker
* Reorganized code for better readability.
* Updated the implementation to avoid global variables.
* Addressed the review comments
* Addressed the review comments to refactor the code.
* Updated the checkpoint timestamp to look for all the modified files in the directory.
* Avoided one disk access to compute timestamp
1 parent fdf817c commit 0176605
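
Two of the squashed commits ("Updated the checkpoint timestamp to look for all the modified files in the directory" and "Avoided one disk access to compute timestamp") describe how checkpoint freshness is detected. A minimal sketch of that idea, using a hypothetical helper name (the actual logic lives in the StateStore, which is not part of this diff):

import os


def latest_checkpoint_timestamp(checkpoint_dir):
    # Hypothetical sketch: report the newest modification time across all files
    # in the checkpoint directory, so that any newly written checkpoint file
    # counts as an update. os.stat() returns the mtime without opening the file.
    latest = 0.0
    for root, _, files in os.walk(checkpoint_dir):
        for name in files:
            latest = max(latest, os.stat(os.path.join(root, name)).st_mtime)
    return latest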

File tree

6 files changed: +386 -2 lines changed

tests/mxnet/test_json_configs/checkpointconfig.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
{
  "LocalPath" : "./savedParams"
}
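
This config appears to mirror the checkpoint configuration file that SageMaker places under /opt/ml/input/config for a training job; the constants added in tornasole/core/config_constants.py below name its path and key. A rough sketch of how such a file could be resolved and read (illustrative only, not the code in this commit):

import json
import os

CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR = "CHECKPOINT_CONFIG_FILE_PATH"
CHECKPOINT_DIR_KEY = "LocalPath"
DEFAULT_CHECKPOINT_CONFIG_FILE = "/opt/ml/input/config/checkpointconfig.json"


def read_checkpoint_dir():
    # Resolve the checkpoint config path from the environment, falling back to
    # the SageMaker default, and return the configured LocalPath (or None).
    config_path = os.environ.get(
        CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR, DEFAULT_CHECKPOINT_CONFIG_FILE
    )
    if not os.path.exists(config_path):
        return None
    with open(config_path) as f:
        return json.load(f).get(CHECKPOINT_DIR_KEY)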

tests/mxnet/test_spot_training.py

Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
# Using batch size 4 instead of 1024 decreases runtime from 35 secs to 4 secs.

from mxnet import gluon, init, autograd
from mxnet.gluon import nn
from mxnet.gluon.data.vision import datasets, transforms
import time
import mxnet as mx
from tornasole import modes
from tornasole.mxnet.hook import TornasoleHook as t_hook
from tornasole import SaveConfig
from tornasole.mxnet import reset_collections
from tornasole.core.access_layer.utils import has_training_ended
from tornasole.core.config_constants import CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR
from tornasole.trials import create_trial
from datetime import datetime

import shutil
import os


def acc(output, label):
    return (output.argmax(axis=1) == label.astype("float32")).mean().asscalar()


def run_mnist(
    hook=None,
    set_modes=False,
    num_steps_train=None,
    num_steps_eval=None,
    epochs=2,
    save_interval=None,
    save_path="./saveParams",
):
    batch_size = 4
    normalize_mean = 0.13
    mnist_train = datasets.FashionMNIST(train=True)

    X, y = mnist_train[0]
    print("X shape: ", X.shape, "X dtype", X.dtype, "y:", y)

    text_labels = [
        "t-shirt",
        "trouser",
        "pullover",
        "dress",
        "coat",
        "sandal",
        "shirt",
        "sneaker",
        "bag",
        "ankle boot",
    ]
    transformer = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(normalize_mean, 0.31)]
    )

    mnist_train = mnist_train.transform_first(transformer)
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)

    train_data = gluon.data.DataLoader(
        mnist_train, batch_size=batch_size, shuffle=True, num_workers=4
    )
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer), batch_size=batch_size, num_workers=4
    )

    # Create the model in Gluon.
    net = nn.HybridSequential()
    net.add(
        nn.Conv2D(channels=6, kernel_size=5, activation="relu"),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(channels=16, kernel_size=3, activation="relu"),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Flatten(),
        nn.Dense(120, activation="relu"),
        nn.Dense(84, activation="relu"),
        nn.Dense(10),
    )
    net.initialize(init=init.Xavier(), ctx=mx.cpu())

    if hook is not None:
        # Register the forward hook.
        hook.register_hook(net)

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), "sgd", {"learning_rate": 0.1})
    hook.register_hook(softmax_cross_entropy)

    # Start the training.
    for epoch in range(epochs):
        train_loss, train_acc, valid_acc = 0.0, 0.0, 0.0
        tic = time.time()
        if set_modes:
            hook.set_mode(modes.TRAIN)

        i = 0
        for data, label in train_data:
            data = data.as_in_context(mx.cpu(0))
            # forward + backward
            with autograd.record():
                output = net(data)
                loss = softmax_cross_entropy(output, label)
            loss.backward()
            # update parameters
            trainer.step(batch_size)
            # calculate training metrics
            train_loss += loss.mean().asscalar()
            train_acc += acc(output, label)
            i += 1
            if num_steps_train is not None and i >= num_steps_train:
                break
        # calculate validation accuracy
        if set_modes:
            hook.set_mode(modes.EVAL)
        i = 0
        for data, label in valid_data:
            data = data.as_in_context(mx.cpu(0))
            val_output = net(data)
            valid_acc += acc(val_output, label)
            loss = softmax_cross_entropy(val_output, label)
            i += 1
            if num_steps_eval is not None and i >= num_steps_eval:
                break
        print(
            "Epoch %d: loss %.3f, train acc %.3f, test acc %.3f, in %.1f sec"
            % (
                epoch,
                train_loss / len(train_data),
                train_acc / len(train_data),
                valid_acc / len(valid_data),
                time.time() - tic,
            )
        )
        if save_interval is not None and (epoch % save_interval) == 0:
            net.save_parameters("{0}/params_{1}.params".format(save_path, epoch))


def test_spot_hook():
    reset_collections()
    os.environ[
        CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR
    ] = "./tests/mxnet/test_json_configs/checkpointconfig.json"
    checkpoint_path = "./savedParams"
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    save_config = SaveConfig(save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])

    """
    Run the training for 2 epochs and save the parameters after every epoch.
    We expect the save steps in the range 0 to 14 (i.e. 10-14) to be written.
    """

    run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_1 = "newlogsRunTest/" + run_id_1
    hook = t_hook(
        out_dir=out_dir_1, save_config=save_config, include_collections=["weights", "gradients"]
    )
    assert has_training_ended(out_dir_1) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=2,
        save_interval=1,
        save_path=checkpoint_path,
    )

    """
    Run the training again for 4 epochs and save the parameters after every epoch.
    We DO NOT expect the save steps 0 to 14 to be written.
    We expect to read steps 40, 50, 60, 70 and 80.
    """
    run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_2 = "newlogsRunTest/" + run_id_2
    hook = t_hook(
        out_dir=out_dir_2, save_config=save_config, include_collections=["weights", "gradients"]
    )
    assert has_training_ended(out_dir_2) is False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=4,
        save_interval=1,
        save_path=checkpoint_path,
    )
    # Unset the environment variable before validation so that it does not affect
    # the other scripts in the pytest environment.
    del os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR]

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.available_steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.available_steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path)
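
The step expectations in the assertions follow from the save configuration and the lengths of the two runs, assuming the hook advances the global step roughly once per batch (train and eval alike); the numbers below are illustrative, not asserted by the test:

# Illustrative step accounting for the two runs above (assumed step semantics):
steps_per_epoch = 10 + 10                # num_steps_train + num_steps_eval
first_run = 2 * steps_per_epoch          # global steps ~0-39: only save steps 10-14 are hit
second_run_start = first_run             # approximately; resumed from the saved tornasole state
second_run = 4 * steps_per_epoch         # global steps ~40-119: save steps 40-80 are hit
print(first_run, second_run_start, second_run)  # 40 40 80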

tornasole/core/access_layer/utils.py

Lines changed: 8 additions & 1 deletion
@@ -5,14 +5,21 @@
 from tornasole.core.utils import is_s3, get_region
 from tornasole.core.logger import get_logger
 from tornasole.core.access_layer.s3handler import S3Handler, ListRequest
+from tornasole.core.sagemaker_utils import is_sagemaker_job
 import asyncio
 import aioboto3

-END_OF_JOB_FILENAME = "END_OF_JOB.ts"
+END_OF_JOB_FILENAME = "training_job_end.ts"
 logger = get_logger()


 def training_has_ended(trial_prefix):
+    # Emit the end of training file only if the job is not running under SageMaker.
+    if is_sagemaker_job():
+        logger.info(
+            f"The end of training job file will not be written for jobs running under SageMaker."
+        )
+        return
     try:
         check_dir_exists(trial_prefix)
         # if path does not exist, then we don't need to write a file
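
A quick way to see the new behaviour in isolation is to stub out is_sagemaker_job; the snippet below is a hypothetical check written for this note, not part of the commit, and it assumes the end-of-job marker is written directly under the trial prefix (tmp_path is the standard pytest fixture):

from unittest import mock

import tornasole.core.access_layer.utils as al_utils


def test_no_end_of_job_file_under_sagemaker(tmp_path):
    # When is_sagemaker_job() reports True, training_has_ended() should return
    # early and never create the training_job_end.ts marker file.
    with mock.patch.object(al_utils, "is_sagemaker_job", return_value=True):
        al_utils.training_has_ended(str(tmp_path))
    assert not (tmp_path / "training_job_end.ts").exists()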

tornasole/core/config_constants.py

Lines changed: 8 additions & 0 deletions
@@ -14,3 +14,11 @@
 TORNASOLE_CONFIG_SAVE_ALL_KEY = "save_all"
 DEFAULT_SAGEMAKER_TORNASOLE_PATH = "/opt/ml/output/tensors"
 TORNASOLE_DEFAULT_COLLECTIONS_FILE_NAME = "worker_0_collections.json"
+CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR = "CHECKPOINT_CONFIG_FILE_PATH"
+CHECKPOINT_DIR_KEY = "LocalPath"
+DEFAULT_CHECKPOINT_CONFIG_FILE = "/opt/ml/input/config/checkpointconfig.json"
+TORNASOLE_META_DATA_FILE = "TornasoleMetadata.json"
+LATEST_GLOBAL_STEP_SEEN = "latest-global-step-seen"
+LATEST_GLOBAL_STEP_SAVED = "latest-global-step-saved"
+LATEST_MODE_STEP = "latest-mode-step"
+TRAINING_RUN = "training-run"
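
The LATEST_* and TRAINING_RUN keys name the fields of the tornasole state that the hook persists (see the hook.py changes below), and TORNASOLE_META_DATA_FILE looks like the file that state lives in. A rough sketch of reading and writing such a metadata file, assuming it is plain JSON inside the checkpoint directory (the real StateStore is not included in this diff):

import json
import os

TORNASOLE_META_DATA_FILE = "TornasoleMetadata.json"


def write_state(checkpoint_dir, state):
    # 'state' is a dict keyed by the TRAINING_RUN / LATEST_* constants above.
    with open(os.path.join(checkpoint_dir, TORNASOLE_META_DATA_FILE), "w") as f:
        json.dump(state, f)


def read_state(checkpoint_dir):
    # Returns the previously saved state, or None on a fresh start.
    path = os.path.join(checkpoint_dir, TORNASOLE_META_DATA_FILE)
    if not os.path.exists(path):
        return None
    with open(path) as f:
        return json.load(f)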

tornasole/core/hook.py

Lines changed: 43 additions & 1 deletion
@@ -21,6 +21,14 @@
 from tornasole.core.logger import get_logger
 from tornasole.core.reductions import get_reduction_tensor_name
 from tornasole.core.writer import FileWriter
+from tornasole.core.state_store import StateStore
+from tornasole.core.config_constants import (
+    TRAINING_RUN,
+    LATEST_GLOBAL_STEP_SAVED,
+    LATEST_GLOBAL_STEP_SEEN,
+    LATEST_MODE_STEP,
+)
+

 logger = get_logger()

@@ -126,13 +134,33 @@ def __init__(
         self.prepared_collections = False
         self.tensor_to_collections = {}
         self.step = init_step
+        self.last_saved_step = None
         self.mode = ModeKeys.GLOBAL
         self.mode_steps = {ModeKeys.GLOBAL: init_step}
         self.writer = None
         self.tb_writers = {}
         self.logger.info("Saving to {}".format(self.out_dir))
         atexit.register(self._cleanup)

+        # Check if there is any last saved tornasole state and, if so, initialize the hook from it.
+        self.training_run = 0
+        self._initialize_to_last_saved_state()
+
+    def _initialize_to_last_saved_state(self):
+        self.state_store = StateStore()
+        last_tornasole_state = self.state_store.get_last_saved_tornasole_state()
+        if last_tornasole_state is not None:
+            self.last_saved_step = last_tornasole_state[LATEST_GLOBAL_STEP_SAVED]
+            self.init_step = last_tornasole_state[LATEST_GLOBAL_STEP_SEEN]
+            self.training_run = 1 + last_tornasole_state[TRAINING_RUN]
+            for (mode, step) in last_tornasole_state[LATEST_MODE_STEP].items():
+                self.mode_steps[ModeKeys[mode]] = step
+            self.mode_steps[ModeKeys.GLOBAL] = self.init_step
+            self.step = self.init_step
+            self.logger.info(
+                f"Initialized the hook with the last saved state: last_saved_step={self.last_saved_step}, init_step={self.init_step}, step={self.step}, mode_steps={str(self.mode_steps)}"
+            )
+
     def __repr__(self):
         return (
             f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}>:(\n"
@@ -309,10 +337,25 @@ def _cleanup(self):
         training_has_ended(self.out_dir)

     def _increment_step(self):
+        # Update the last tornasole state with the last step number that was saved or seen.
+        self._write_tornasole_state()
+
         self.step += 1
         self.mode_steps[self.mode] += 1
         self._collections_to_save_for_step = None

+    def _write_tornasole_state(self):
+        if self.state_store.is_checkpoint_updated():
+            current_tornasole_state = dict()
+            current_tornasole_state[TRAINING_RUN] = self.training_run
+            current_tornasole_state[LATEST_GLOBAL_STEP_SAVED] = self.last_saved_step
+            current_tornasole_state[LATEST_GLOBAL_STEP_SEEN] = self.step
+            mode_step = dict()
+            for (mode, step) in self.mode_steps.items():
+                mode_step[mode.name] = step
+            current_tornasole_state[LATEST_MODE_STEP] = mode_step
+            self.state_store.update_tornasole_state(current_tornasole_state)
+
     def set_mode(self, mode):
         # train
         if mode in ALLOWED_MODES:
@@ -521,7 +564,6 @@ def __init__(
             include_collections=include_collections,
             save_all=save_all,
         )
-        self.last_saved_step = None
         self.exported_collections = False
         self.data_type_name = data_type_name
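
To see what _initialize_to_last_saved_state does in isolation, here is a stripped-down sketch of the same bookkeeping with a stand-in ModeKeys enum and illustrative numbers (not the real hook or StateStore):

from enum import Enum


class ModeKeys(Enum):  # stand-in for tornasole's ModeKeys
    GLOBAL = 0
    TRAIN = 1
    EVAL = 2


def restore(last_state):
    # Mirrors _initialize_to_last_saved_state: resume the step counters from the
    # previously saved tornasole state instead of starting again from zero.
    init_step = last_state["latest-global-step-seen"]
    mode_steps = {ModeKeys[m]: s for m, s in last_state["latest-mode-step"].items()}
    mode_steps[ModeKeys.GLOBAL] = init_step
    return {
        "training_run": last_state["training-run"] + 1,
        "last_saved_step": last_state["latest-global-step-saved"],
        "step": init_step,
        "mode_steps": mode_steps,
    }


resumed = restore(
    {
        "training-run": 0,
        "latest-global-step-saved": 14,
        "latest-global-step-seen": 39,
        "latest-mode-step": {"TRAIN": 19, "EVAL": 19},
    }
)
print(resumed["step"], resumed["training_run"])  # 39 1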
