Commit 90ef076
Add loss registration to PT by calling hook.register_loss(criterion) (aws#269)
* Add loss registration to PT by calling hook.register_loss(criterion)
* Change output0 to output_0 for consistency
* Hide some overwhelming logging
* Case-sensitive checks
* Fix loss regex
1 parent 72dd7e7 commit 90ef076
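
In short, the new API is used like this; a minimal sketch assembled from tests/pytorch/test_loss.py below (Net, the output directory, and the save interval are simply the values used in that test):

import torch.nn as nn
import torch.optim as optim
import tornasole.pytorch as ts

net = Net()                        # any torch.nn.Module; Net is defined in the test below
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)

hook = ts.TornasoleHook(out_dir='/tmp/pytorch_test_loss',
                        save_config=ts.SaveConfig(save_interval=1))
hook.register_hook(net)        # weights, biases, gradients, layer inputs/outputs
hook.register_loss(criterion)  # new in this commit: the loss is saved as 'CrossEntropyLoss_output_0'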

File tree

7 files changed: +148 / -39 lines changed

tests/pytorch/test_loss.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+import pytest
+import shutil
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+import tornasole.pytorch as ts
+from tornasole.trials import Trial, create_trial
+
+class Net(nn.Module):
+    """CIFAR-10 classification network structure."""
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x
+
+def test_register_loss():
+    """Test that the loss is saved as a tensor."""
+    out_dir = '/tmp/pytorch_test_loss'
+    shutil.rmtree(out_dir, ignore_errors=True)
+
+    net = Net()
+    criterion = nn.CrossEntropyLoss()
+    optimizer = optim.SGD(net.parameters(), lr=0.05, momentum=0.9)
+
+    hook = ts.TornasoleHook(
+        out_dir=out_dir,
+        # With the default SaveConfig, the weights are not saved (only loss/gradient).
+        # The weight tensors will be saved only at the final step, and only if it's a multiple
+        # of save_interval. Issue with flushing?
+        save_config=ts.SaveConfig(save_interval=1),
+    )
+    hook.register_hook(net)
+    hook.register_loss(criterion)  # This is the important line
+
+    batch_size = 1
+    n_steps = 5
+    # Use the same data at each step to test loss decreasing
+    inputs, labels = torch.rand(batch_size, 3, 32, 32), torch.zeros(batch_size).long()
+    for _ in range(n_steps):
+        optimizer.zero_grad()
+        outputs = net(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+    # TODO(nieljare): Remove reliance on hook._cleanup()
+    # What if the user has a training loop, then calls the Trials API in the same Python script
+    # (like we do here)? Then it'll crash, likewise in a Jupyter notebook.
+    hook._cleanup()
+
+    trial = create_trial(path=out_dir, name='run')
+    loss_coll = hook.collection_manager.get('losses')
+    assert len(loss_coll.get_tensor_names()) == 3
+
+    loss_tensor = trial.tensor('CrossEntropyLoss_output_0')
+    print(f"loss_tensor.steps() = {loss_tensor.steps()}")
+
+    gradient_tensor = trial.tensor('gradient/Net_fc1.weight')
+    print(f"gradient_tensor.steps() = {gradient_tensor.steps()}")
+
+    weight_tensor = trial.tensor('Net_fc1.weight')
+    print(f"weight_tensor.steps() = {weight_tensor.steps()}")
+
+    assert len(trial.available_steps()) == n_steps
+    assert len(weight_tensor.steps()) == n_steps
+    assert len(gradient_tensor.steps()) == n_steps
+    assert len(loss_tensor.steps()) == n_steps
+    assert loss_tensor.value(0) > loss_tensor.value(4)

tests/pytorch/test_simple_write.py

Lines changed: 14 additions & 14 deletions
@@ -43,12 +43,12 @@ def __init__(self, mode='weights-bias-gradients', to_save=[]):
         self.saved['relu2_input_0'] = dict()
         self.saved['fc3_input_0'] = dict()
         self.saved['Net_input_0'] = dict()
-        self.saved['fc1_output0'] = dict()
-        self.saved['relu1_output0'] = dict()
-        self.saved['fc2_output0'] = dict()
-        self.saved['relu2_output0'] = dict()
-        self.saved['fc3_output0'] = dict()
-        self.saved['Net_output0'] = dict()
+        self.saved['fc1_output_0'] = dict()
+        self.saved['relu1_output_0'] = dict()
+        self.saved['fc2_output_0'] = dict()
+        self.saved['relu2_output_0'] = dict()
+        self.saved['fc3_output_0'] = dict()
+        self.saved['Net_output_0'] = dict()


     def forward(self, x_in):
@@ -73,12 +73,12 @@ def forward(self, x_in):
         self.saved['fc3_input_0'][self.step] = relu2_out.data.numpy().copy()
         self.saved['Net_input_0'][self.step] = fc3_out.data.numpy().copy()

-        self.saved['fc1_output0'][self.step] = fc1_out.data.numpy().copy()
-        self.saved['relu1_output0'][self.step] = relu1_out.data.numpy().copy()
-        self.saved['fc2_output0'][self.step] = fc2_out.data.numpy().copy()
-        self.saved['relu2_output0'][self.step] = relu2_out.data.numpy().copy()
-        self.saved['fc3_output0'][self.step] = fc3_out.data.numpy().copy()
-        self.saved['Net_output0'][self.step] = out.data.numpy().copy()
+        self.saved['fc1_output_0'][self.step] = fc1_out.data.numpy().copy()
+        self.saved['relu1_output_0'][self.step] = relu1_out.data.numpy().copy()
+        self.saved['fc2_output_0'][self.step] = fc2_out.data.numpy().copy()
+        self.saved['relu2_output_0'][self.step] = relu2_out.data.numpy().copy()
+        self.saved['fc3_output_0'][self.step] = fc3_out.data.numpy().copy()
+        self.saved['Net_output_0'][self.step] = out.data.numpy().copy()
         return out

 # Create a tornasole hook. The initilization of hook determines which tensors
@@ -202,7 +202,7 @@ def saveall_test_helper(hook=None):
     weights = ['Net_fc1.weight', 'Net_fc2.weight', 'Net_fc3.weight']
     bias = ['Net_fc1.bias', 'Net_fc2.bias', 'Net_fc3.bias']
     inputs = ['fc1_input_0', 'relu1_input_0', 'fc2_input_0', 'relu2_input_0', 'fc3_input_0']
-    outputs = ['fc1_output0', 'relu1_output0', 'fc2_output0', 'relu2_output0', 'fc3_output0']
+    outputs = ['fc1_output_0', 'relu1_output_0', 'fc2_output_0', 'relu2_output_0', 'fc3_output_0']
     tensors = grads + bias + weights + inputs + outputs

     assert len(trial.available_steps()) == len(save_steps)
@@ -237,7 +237,7 @@ def helper_test_multi_collections(hook, out_dir):
     weights = ['Net_fc1.weight', 'Net_fc2.weight', 'Net_fc3.weight']
     bias = ['Net_fc1.bias', 'Net_fc2.bias', 'Net_fc3.bias']
     inputs = ['fc1_input_0', 'relu1_input_0', 'relu2_input_0']
-    outputs = ['fc1_output0', 'relu1_output0', 'relu2_output0']
+    outputs = ['fc1_output_0', 'relu1_output_0', 'relu2_output_0']
     tensors = grads + bias + weights + inputs + outputs

     assert len(trial.available_steps()) == len(save_steps)

tornasole/core/hook.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ class CallbackHook(BaseHook):
     __metaclass__ = ABCMeta
     INVALID_TAG_CHARACTERS = _re.compile(r'[^-/\w\.]')
     INPUT_TENSOR_SUFFIX = '_input_'
-    OUTPUT_TENSOR_SUFFIX = '_output'
+    OUTPUT_TENSOR_SUFFIX = '_output_'
    GRADIENT_PREFIX = 'gradient/'

     def __init__(self,
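
For context (not part of the diff): the hook builds tensor names from the module name plus these suffixes and an argument index, so this one-character change is what turns names like fc1_output0 into fc1_output_0 and gives the loss its CrossEntropyLoss_output_0 name. A rough sketch of the assumed naming scheme:

def output_tensor_name(module_name, index):
    # Assumed scheme: <module name> + OUTPUT_TENSOR_SUFFIX + <output index>
    OUTPUT_TENSOR_SUFFIX = '_output_'
    return module_name + OUTPUT_TENSOR_SUFFIX + str(index)

assert output_tensor_name('fc1', 0) == 'fc1_output_0'
assert output_tensor_name('CrossEntropyLoss', 0) == 'CrossEntropyLoss_output_0'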

tornasole/core/utils.py

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ def get_worker_name_from_collection_file(filename):
     return re.match(worker_name_regex, filename).group(1)

 def match_inc(tname, include):
+    """Matches anywhere in the string, doesn't require full match."""
     for inc in include:
         if re.search(inc, tname):
             return True
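
A self-contained illustration of that docstring (the snippet below just mirrors match_inc's re.search-based logic; it is not the library code itself). Because re.search matches anywhere in the tensor name and is case-sensitive, a pattern like 'Loss' picks up 'CrossEntropyLoss_output_0' without needing a '.*' prefix:

import re

def match_inc(tname, include):
    # Mirror of the helper: re.search matches anywhere; re.match would only match at the start.
    return any(re.search(inc, tname) for inc in include)

assert match_inc('CrossEntropyLoss_output_0', ['Loss'])   # substring match
assert not match_inc('my_custom_loss', ['Loss'])          # case-sensitive: 'Loss' != 'loss'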

tornasole/mxnet/hook.py

Lines changed: 2 additions & 1 deletion
@@ -107,7 +107,8 @@ def forward_hook(self, block, inputs, outputs):
             return

         block_name = block.name
-        logger.debug("Processing the global step {0} for block {1}".format(self.step, block_name))
+        # This overwhelms the logs; turn back on if you really need it
+        # logger.debug("Processing the global step {0} for block {1}".format(self.step, block_name))

         # Output input tensor
         self._write_inputs(block_name, inputs)

tornasole/pytorch/hook.py

Lines changed: 46 additions & 22 deletions
@@ -14,7 +14,8 @@
     CollectionKeys.WEIGHTS,
     CollectionKeys.BIASES,
     CollectionKeys.GRADIENTS,
-    CollectionKeys.DEFAULT
+    CollectionKeys.DEFAULT,
+    CollectionKeys.LOSSES,
 ]


@@ -68,9 +69,9 @@ def log_params(self, module):
         params = module.named_parameters()
         for name, param in params:
             pname = module_name + '_' + name
-            self.logger.debug(
-                "Processing the global step {0} for parameter {1}".format(
-                    self.step, pname))
+            # This overwhelms the logs; turn back on if you really need it
+            # self.logger.debug(
+            #     "Processing the global step {0} for parameter {1}".format(self.step, pname))
             self._write_tensor(tensor_name=pname, tensor_value=param.data)

     # This hook is invoked by trainer prior to running the forward pass.
@@ -93,15 +94,16 @@ def forward_pre_hook(self, module, inputs):
         if self.last_saved_step is not None and not self.exported_collections:
             self.export_collections()
             self.exported_collections = True
-
+
     # This hook is invoked by trainer after running the forward pass.
     def forward_hook(self, module, inputs, outputs):
         if self.collections_in_this_step is None:
             logging.debug("Skipping the global step {0}".format(self.step))
             return

         module_name = self.module_maps[module]
-        logger.debug("Processing the global step {0} for module {1}".format(self.step, module_name))
+        # This overwhelms the logs; turn back on if you really need it
+        # logger.debug("Processing the global step {0} for module {1}".format(self.step, module_name))

         # Output input tensor
         self._write_inputs(module_name, inputs)
@@ -119,39 +121,61 @@ def back(grad):
             self._write_tensor(tensor_name=self.GRADIENT_PREFIX + tname, tensor_value=grad)
         return back

-    def _recursive_apply(self, module):
-        """
-        This function is "applied" to every child in the block. This function in turn
-        registers the forward hook to each module. It helps logging the input output tensors
-        of that module.
-        """
-        module.register_forward_hook(self.forward_hook)
-
     def _backward_apply(self, module):
+        """Apply the function `self.backward_hook` as a callback to each parameter in `module`.
+
+        This will capture the gradients.
+        """
         params = module.named_parameters()
         for name, param in params:
             pname = module._get_name() + '_' + name
             param.register_hook(self.backward_hook(pname))

+    def closure_for_registering_forward_hook(self, module):
+        """Lambda functions don't work here."""
+        module.register_forward_hook(self.forward_hook)
+
     def register_hook(self, module):
         """
         This function registers the forward hook. If user wants to register the hook
         for every child in the given block, then the function calls "apply" API for
         registration of the hook.
-        The hook is registered recursively for all blocks
+        The hook is registered recursively for all blocks.
         """
+        # Typechecking
         if not isinstance(module, torch.nn.Module):
-            logger.error("The given module type {0} is not currently supported by Tornasole Hook".format(
-                module.__class__.__name__))
-            return
-        module.register_forward_pre_hook(self.forward_pre_hook)
+            raise ValueError(f"Module type {module.__class__.__name__} must be type torch.nn.Module")

-        for layer in list(module.named_modules()):
-            self.module_maps[layer[1]] = layer[0]
+        # Create a mapping from modules to their names
+        for name, submodule in module.named_modules():
+            assert submodule not in self.module_maps, f"Don't register module={module} twice"
+            self.module_maps[submodule] = name
         self.module_maps[module] = module._get_name()
-        module.apply(self._recursive_apply)
+
+        # Use `forward_pre_hook` for the entire net
+        module.register_forward_pre_hook(self.forward_pre_hook)
+
+        # Set `self.forward_hook` as a callback for each submodule/layer.
+        # `module.apply(fn)` calls fn for each submodule in module.children()
+        module.apply(self.closure_for_registering_forward_hook)
+
+        # Capture the gradient for each parameter in the net
         self._backward_apply(module)

+    def register_loss(self, loss_module):
+        """Register something like `criterion = nn.CrossEntropyLoss()`."""
+        # Typechecking
+        assert isinstance(loss_module, torch.nn.modules.loss._Loss), (
+            f"loss_module={loss_module} must be a subclass of `torch.nn.modules.loss._Loss`, "
+            f"but has class hierarchy {type.mro(type(loss_module))}"
+        )
+        # Register the module in self.module_maps
+        name = loss_module._get_name()
+        self.module_maps[loss_module] = name
+        # Add a callback to the forward pass
+        loss_module.register_forward_hook(self.forward_hook)
+
+
     @staticmethod
     def _get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs):
         return get_reduction_of_data(reduction_name, tensor_value, tensor_name, abs)
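
Hedged note: since register_loss routes the criterion through the same forward_hook as any other module, its inputs and output should be written with the standard naming scheme. Under that assumption, the three tensor names counted in the 'losses' collection by test_register_loss would be as listed below (only the output name is confirmed by the test; the input names are an inference):

expected_loss_tensors = [
    'CrossEntropyLoss_input_0',   # predictions passed to the criterion (assumed name)
    'CrossEntropyLoss_input_1',   # labels passed to the criterion (assumed name)
    'CrossEntropyLoss_output_0',  # the loss value itself (asserted in test_register_loss)
]
assert len(expected_loss_tensors) == 3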

tornasole/pytorch/torch_collection.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def _register_default_collections(self):
         self.get(CollectionKeys.WEIGHTS).include('^(?!gradient).*weight')
         self.get(CollectionKeys.BIASES).include('^(?!gradient).*bias')
         self.get(CollectionKeys.GRADIENTS).include('^gradient')
-        self.get(CollectionKeys.LOSSES).include('.*loss')
+        self.get(CollectionKeys.LOSSES).include('Loss')

     def create_collection(self, name):
         super().create_collection(name, cls=Collection)
