Commit e7e9b46

PyTorch Distributed Training Support & Test (aws#272)
* Initial commit
* Add test for PT-DT
* Rename file
* Remove extraneous file
* Add function back
* Add check for horovod.torch
* Fix f-string
* catch ImportError
* Remove try-catch
* Address Rahul's comment
* Check that torch.distributed is available
* Add trial.workers() check
* Wrap race condition
* Add reset_collections() to test cases
* peace offering to CI
1 parent 4f8dd64 commit e7e9b46
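
In short, this change makes the PyTorch hook name each process after its torch.distributed rank. A minimal sketch of the intended usage, assuming the tornasole.pytorch and tornasole.trials APIs exercised by the new test below (paths and printed values are illustrative, not taken verbatim from the diff):

    import torch
    import torch.distributed as dist
    import tornasole.pytorch as ts
    from tornasole.trials import create_trial

    # Inside each spawned process, after dist.init_process_group(...) has run:
    model = torch.nn.Linear(1, 1)
    hook = ts.TornasoleHook(out_dir='/tmp/run', save_all=True)
    hook.register_hook(model)
    print(hook.get_worker_name())   # "worker_0", "worker_1", ... one per rank

    # Back in the parent, after every process has finished and cleaned up:
    trial = create_trial(path='/tmp/run')
    print(trial.workers())          # e.g. ['worker_0', 'worker_1']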

File tree

8 files changed (+226, -23 lines)

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
"""
Tests core functionality of naming workers when there are multiple processes.
See https://pytorch.org/tutorials/intermediate/ddp_tutorial.html to decide
how we want to support DistributedDataParallel with limited user configuration.

The key method is
torch.distributed.get_rank() - when manually spawning processes
"""
import numpy as np
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn.functional as F
import torch.optim as optim
import shutil


import tornasole.pytorch as ts
from tornasole.trials import Trial, create_trial

out_dir = '/tmp/run'

class Net(nn.Module):
    """Returns f(x) = sigmoid(w*x + b)"""
    def __init__(self):
        super().__init__()
        self.add_module('fc', nn.Linear(1, 1))

    def forward(self, x):
        x = self.fc(x)
        x = F.sigmoid(x)
        return x

def dataset(batch_size=4):
    """Return a dataset of (data, target)."""
    data = torch.rand(batch_size, 1)
    target = F.sigmoid(2 * data + 1)
    return data, target

def train(model, device, optimizer, num_steps=10):
    """Runs the training loop; no explicit Tornasole calls here."""
    model.train()
    for i in range(num_steps):
        batch_size = 4
        data = torch.rand(batch_size, 1)
        target = F.sigmoid(2 * data + 1)
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.mse_loss(output, target)
        loss.backward()
        optimizer.step()



def run(rank, size, num_epochs=10, batch_size=128, num_batches=10):
    """Distributed training loop run by each spawned process."""
    torch.manual_seed(1234)
    device = torch.device('cpu')
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=1)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = ts.TornasoleHook(
        out_dir=out_dir,
        save_config=ts.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
    )
    hook.register_hook(model)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset(batch_size)
            output = model(data)
            loss = F.mse_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            average_gradients(model)
            optimizer.step()
        # print(f"Rank {dist.get_rank()}, epoch {epoch}: {epoch_loss / num_batches}")

    assert hook.get_worker_name() == f"worker_{dist.get_rank()}"
    # Race condition here where both workers attempt to move
    # /tmp/{out_dir}/END_OF_JOB.ts to {out_dir}/END_OF_JOB.ts
    try:
        hook._cleanup()
    except FileNotFoundError:
        pass

def average_gradients(model):
    """Gradient averaging."""
    size = float(dist.get_world_size())
    for param in model.parameters():
        dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM)
        param.grad.data /= size

def init_processes(rank, size, fn, backend='gloo'):
    """Initialize the distributed environment."""
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

def test_run_net_single_process():
    """Runs a single linear layer."""
    ts.reset_collections()
    device = torch.device('cpu')
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    shutil.rmtree(out_dir, ignore_errors=True)
    hook = ts.TornasoleHook(
        out_dir=out_dir,
        save_config=ts.SaveConfig(save_steps=[0, 1, 5]),
        save_all=True,
    )
    hook.register_hook(model)
    train(model=model, device=device, optimizer=optimizer)
    hook._cleanup()

    assert hook.get_worker_name() == "worker_0"

    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 1, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"
    shutil.rmtree(out_dir, ignore_errors=True)

def test_run_net_distributed():
    """Runs a single linear layer on 2 processes."""
    # torch.distributed is empty on Mac on Torch <= 1.2
    if not hasattr(dist, 'is_initialized'):
        return

    ts.reset_collections()
    size = 2
    processes = []
    for rank in range(size):
        p = Process(target=init_processes, args=(rank, size, run))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    # WARNING: assert statements do not cause test failure inside subprocesses
    # https://stackoverflow.com/questions/13400546/py-test-how-to-automatically-detect-an-exception-in-a-child-process
    assert all([not p.exitcode for p in processes]), f"Some processes failed. processes={processes}"

    out_dir = '/tmp/run'
    trial = create_trial(path=out_dir)
    assert len(trial.workers()) == 2, f"trial.workers() = {trial.workers()}"
    assert len(trial.steps()) == 3, f"trial.steps() = {trial.steps()}"
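
The test above drives the raw module and averages gradients manually; DistributedDataParallel is imported but never exercised. As a rough, unverified sketch of what the DDP variant alluded to in the file's docstring might look like (reusing the imports and helpers defined above, and assuming dist.init_process_group has already run as in init_processes; whether register_hook should be attached to the wrapper or the underlying module is exactly the open question, so treat this as an assumption, not the supported path):

    def run_ddp(rank, size, num_batches=10):
        """Hypothetical DDP variant of run(); not part of the committed test."""
        torch.manual_seed(1234)
        model = DDP(Net().to(torch.device('cpu')))   # DDP averages gradients itself
        optimizer = optim.SGD(model.parameters(), lr=1)
        hook = ts.TornasoleHook(
            out_dir=out_dir,
            save_config=ts.SaveConfig(save_steps=[0, 1, 5]),
            save_all=True,
        )
        hook.register_hook(model)   # assumption: registering on the wrapper works
        for _ in range(num_batches):
            optimizer.zero_grad()
            data, target = dataset()
            loss = F.mse_loss(model(data), target)
            loss.backward()          # DDP all-reduces gradients during backward
            optimizer.step()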

tests/pytorch/test_loss.py

Lines changed: 2 additions & 1 deletion
@@ -30,6 +30,7 @@ def forward(self, x):

 def test_register_loss():
     """Test that the loss is saved as a tensor."""
+    ts.reset_collections()
     out_dir = '/tmp/pytorch_test_loss'
     shutil.rmtree(out_dir, ignore_errors=True)

@@ -63,7 +64,7 @@ def test_register_loss():
     # (like we do here). Then it'll crash, likewise in a Jupyter notebook.
     hook._cleanup()

-    trial = create_trial(path=out_dir, name='run')
+    trial = create_trial(path=out_dir)
     loss_coll = hook.collection_manager.get('losses')
     assert len(loss_coll.get_tensor_names()) == 3
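
The ts.reset_collections() calls added in this commit guard against collection state leaking between tests in the same pytest session. A possible way to centralize that (a sketch only, not part of this commit) would be an autouse fixture:

    import pytest
    import tornasole.pytorch as ts

    @pytest.fixture(autouse=True)
    def clean_collections():
        # Hypothetical fixture: reset Tornasole's global collection manager
        # before each test instead of calling reset_collections() inline.
        ts.reset_collections()
        yield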

tests/pytorch/test_modes.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 from torch.autograd import Variable
 from tornasole import modes, SaveConfig, SaveConfigMode
 from tornasole.pytorch.hook import *
-from tornasole.pytorch.torch_collection import *
+from tornasole.pytorch.collection import *
 from tornasole.pytorch import reset_collections
 from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
 import uuid

tests/pytorch/test_simple_write.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 from tornasole import SaveConfig
 from tornasole.pytorch.hook import *
-from tornasole.pytorch.torch_collection import *
+from tornasole.pytorch.collection import *
 from tornasole.pytorch import reset_collections
 from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
 import uuid

tornasole/core/index_reader.py

Lines changed: 29 additions & 4 deletions
@@ -1,5 +1,7 @@
+import numpy as np
 import os
 import json
+from typing import Any, Dict, List, Tuple
 from tornasole.core.locations import TensorLocation, IndexFileLocationUtils
 from tornasole.core.s3_utils import list_s3_objects
 from tornasole.core.access_layer.s3handler import ReadObjectRequest, S3Handler

@@ -65,7 +67,14 @@ def list_index_files_in_dir(dirname):
         return sorted(index_files)

     @staticmethod
-    def get_disk_responses(path, start_after_key=0, range_steps=None):
+    def get_disk_responses(path, start_after_key=0, range_steps=None) -> Tuple[List[bytes], List[int], int]:
+        """Read files like `trial_{datetime}/index/000/{step}_{worker}.json`.
+
+        Returns:
+            responses: List of the contents of each file, encoded as bytes.
+            steps: List of steps read.
+            start_after_key: An int referring to where to start reading next time.
+        """
         index_files = LocalIndexReader.list_index_files_in_dir(path)
         steps = []
         workers = []

@@ -86,7 +95,7 @@ def get_disk_responses(path, start_after_key=0, range_steps=None):
 class IndexReader:

     @staticmethod
-    def fetch_tensor_value(tensor_location):
+    def fetch_tensor_value(tensor_location: TensorLocation) -> np.ndarray:
         event_file_name = tensor_location.event_file_name
         start = tensor_location.start_idx
         length = tensor_location.length

@@ -107,7 +116,8 @@ def fetch_tensor_value(tensor_location):
         return tensor_data

     @staticmethod
-    def load_tensor_data_from_index_files(path, start_after_key=None, range_steps=None):
+    def load_tensor_data_from_index_files(path, start_after_key=None, range_steps=None) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], int]:
+        """Return a triply nested dict referring to tensor data."""
         s3, bucket_name, prefix_name = is_s3(path)
         if s3:
             if start_after_key == 0:

@@ -130,7 +140,22 @@ def _validate(index_dict):
             raise IndexReaderException('tensor_payload section is not present')

     @staticmethod
-    def _update_tensors_from_json(index_tensors_dict, step, response, path, worker):
+    def _update_tensors_from_json(index_tensors_dict, step, response: bytes, path, worker) -> Dict[str, Dict[int, Dict[str, TensorLocation]]]:
+        """Return a triply nested dict referring to tensor data.
+
+        Example:
+            {
+                'dense/bias:0': {
+                    0: {
+                        'tensor_location': <TensorLocation object>
+                    },
+                    2: { ... },
+                    ...
+                },
+                'conv2d/kernel:0': { ... },
+                ...
+            }
+        """
         index_dict = json.loads(response)
         IndexReader._validate(index_dict)
         index_meta = index_dict['meta']
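
For orientation, a rough sketch of how a reader might walk the index and materialize one value, assuming load_tensor_data_from_index_files returns the nested layout documented in the docstrings above (tensor name, then step, then a 'tensor_location' entry); in practice the trial classes wrap this for you:

    # Sketch only: the inner dict layout is taken from the docstring example above.
    index_tensors_dict, last_key = IndexReader.load_tensor_data_from_index_files('/tmp/run')
    for tname, steps in index_tensors_dict.items():
        for step, entry in steps.items():
            value = IndexReader.fetch_tensor_value(entry['tensor_location'])
            print(tname, step, value.shape)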

tornasole/pytorch/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 from .hook import TornasoleHook
-from .torch_collection import Collection, CollectionManager
+from .collection import Collection, CollectionManager

-from .torch_collection import get_collections, get_collection, \
+from .collection import get_collections, get_collection, \
     load_collections, \
     add_to_collection, add_to_default_collection, reset_collections
 from tornasole import SaveConfig, SaveConfigMode, ReductionConfig

tornasole/pytorch/torch_collection.py renamed to tornasole/pytorch/collection.py

Lines changed: 1 addition & 1 deletion
@@ -62,4 +62,4 @@ def get_collection(collection_name):
     return _collection_manager.get(collection_name, create=True)

 def get_collections():
-    return _collection_manager.collections
+    return _collection_manager.collections

(The removed and added lines are textually identical; beyond the rename, this hunk likely only adds a missing newline at the end of the file.)

tornasole/pytorch/hook.py

Lines changed: 33 additions & 13 deletions
@@ -1,10 +1,12 @@
+import importlib
 import torch
+import torch.distributed as dist
 import logging
 from tornasole.core.hook import CallbackHook
 from tornasole.core.collection import CollectionKeys
 from tornasole.core.logger import get_logger
 from tornasole.core.json_config import create_hook_from_json_config
-from tornasole.pytorch.torch_collection import get_collection_manager
+from tornasole.pytorch.collection import get_collection_manager
 from tornasole.pytorch.utils import get_reduction_of_data, make_numpy_array
 from tornasole.core.json_config import TORNASOLE_CONFIG_DEFAULT_WORKER_NAME

@@ -45,20 +47,38 @@ def __init__(self,
         self.module_maps = dict()

     def get_num_workers(self):
-        try:
-            import horovod.torch as hvd
-            if hvd.size():
-                return hvd.size()
-        except (ModuleNotFoundError, ValueError, ImportError):
-            return 1
+        """Check horovod and torch.distributed."""
+        # Try torch.distributed
+        # torch.distributed is empty on Mac on Torch <= 1.2
+        if hasattr(dist, 'is_initialized') and dist.is_initialized():
+            return torch.distributed.get_world_size()
+        # Try horovod
+        else:
+            try:
+                import horovod.torch as hvd
+                if hvd.size():
+                    return hvd.size()
+            except (ModuleNotFoundError, ValueError, ImportError):
+                pass
+        # Return default
+        return 1

     def get_worker_name(self):
-        try:
-            import horovod.torch as hvd
-            if hvd.size():
-                return f'worker_{hvd.rank()}'
-        except (ModuleNotFoundError, ValueError, ImportError):
-            return TORNASOLE_CONFIG_DEFAULT_WORKER_NAME
+        """Check horovod and torch.distributed."""
+        # Try torch.distributed
+        # torch.distributed is empty on Mac on Torch <= 1.2
+        if hasattr(dist, 'is_initialized') and dist.is_initialized():
+            return f"worker_{dist.get_rank()}"
+        # Try horovod
+        else:
+            try:
+                import horovod.torch as hvd
+                if hvd.size():
+                    return f"worker_{hvd.rank()}"
+            except (ModuleNotFoundError, ValueError, ImportError):
+                pass
+        # Return default
+        return TORNASOLE_CONFIG_DEFAULT_WORKER_NAME

     @classmethod
     def hook_from_config(cls):
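
A quick illustration of the precedence these two methods now implement (a sketch, assuming tornasole.pytorch is imported as ts as in the tests above; printed values are illustrative):

    # Fallback order:
    # 1. torch.distributed, if the module is usable and a process group is initialized
    # 2. horovod.torch, if importable and initialized
    # 3. the single-process default
    hook = ts.TornasoleHook(out_dir='/tmp/run')
    print(hook.get_num_workers())   # world size inside a dist group (2 in the test),
                                    # hvd.size() under Horovod, otherwise 1
    print(hook.get_worker_name())   # "worker_{rank}" when distributed, otherwise
                                    # TORNASOLE_CONFIG_DEFAULT_WORKER_NAME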
