Commit cf838ad

use DDP to train MNIST in parallel
1 parent 7783b31 commit cf838ad

2 files changed (+281, -32 lines)
New file: 211 additions, 0 deletions (the communication module that main.py below imports as comm_file)

import os
import torch
import torch.distributed as dist
from mpi4py import MPI


class distributed():
    def get_size(self):
        if dist.is_available() and dist.is_initialized():
            size = dist.get_world_size()
        else:
            size = 1
        return size

    def get_rank(self):
        if dist.is_available() and dist.is_initialized():
            rank = dist.get_rank()
        else:
            rank = 0
        return rank

    def get_local_rank(self):
        if not (dist.is_available() and dist.is_initialized()):
            return 0
        # Map the global rank onto the GPUs of this node
        if torch.cuda.is_available():
            local_rank = dist.get_rank() % torch.cuda.device_count()
        else:
            # running on a CPU device should not call this function
            local_rank = -1
        return local_rank

    def __init__(self, method):
        # MASTER_PORT - required; has to be a free port on the machine with rank 0
        # MASTER_ADDR - required (except for rank 0); address of the rank 0 node
        # WORLD_SIZE  - required; can be set either here or in a call to the init function
        # RANK        - required; can be set either here or in a call to the init function

        if method == "nccl-slurm":
            # MASTER_ADDR can be set in the SLURM batch script using the command
            #   scontrol show hostnames $SLURM_JOB_NODELIST
            if "MASTER_ADDR" not in os.environ:
                # Fall back to SLURM_LAUNCH_NODE_IPADDR, the IP address of the node
                # from which the task launch was initiated (where the srun command
                # ran). It may not be the node of rank 0.
                if "SLURM_LAUNCH_NODE_IPADDR" in os.environ:
                    os.environ["MASTER_ADDR"] = os.environ["SLURM_LAUNCH_NODE_IPADDR"]
                else:
                    raise Exception("Error: nccl-slurm - SLURM_LAUNCH_NODE_IPADDR is not set")

            # Use the default PyTorch port
            if "MASTER_PORT" not in os.environ:
                if "SLURM_SRUN_COMM_PORT" in os.environ:
                    os.environ["MASTER_PORT"] = os.environ["SLURM_SRUN_COMM_PORT"]
                else:
                    os.environ["MASTER_PORT"] = "29500"

            # obtain WORLD_SIZE
            if "WORLD_SIZE" not in os.environ:
                if "SLURM_NTASKS" in os.environ:
                    world_size = os.environ["SLURM_NTASKS"]
                else:
                    if "SLURM_JOB_NUM_NODES" in os.environ:
                        num_nodes = os.environ["SLURM_JOB_NUM_NODES"]
                    else:
                        raise Exception("Error: nccl-slurm - SLURM_JOB_NUM_NODES is not set")
                    if "SLURM_NTASKS_PER_NODE" in os.environ:
                        ntasks_per_node = os.environ["SLURM_NTASKS_PER_NODE"]
                    elif "SLURM_TASKS_PER_NODE" in os.environ:
                        ntasks_per_node = os.environ["SLURM_TASKS_PER_NODE"]
                    else:
                        raise Exception("Error: nccl-slurm - SLURM_(N)TASKS_PER_NODE is not set")
                    # the environment variables are strings; convert before multiplying
                    world_size = int(ntasks_per_node) * int(num_nodes)
                os.environ["WORLD_SIZE"] = str(world_size)

            # obtain RANK
            if "RANK" not in os.environ:
                if "SLURM_PROCID" in os.environ:
                    os.environ["RANK"] = os.environ["SLURM_PROCID"]
                else:
                    raise Exception("Error: nccl-slurm - SLURM_PROCID is not set")

            # Initialize DDP module
            dist.init_process_group(backend="nccl", init_method='env://')

        elif method == "nccl-openmpi":
            if "MASTER_ADDR" not in os.environ:
                if "PMIX_SERVER_URI2" in os.environ:
                    # keep only the host part of the URI, e.g. "tcp4://host:port"
                    addr = os.environ["PMIX_SERVER_URI2"]
                    os.environ["MASTER_ADDR"] = addr.split("//")[1].split(":")[0]
                else:
                    raise Exception("Error: nccl-openmpi - PMIX_SERVER_URI2 is not set")

            # Use the default PyTorch port
            if "MASTER_PORT" not in os.environ:
                os.environ["MASTER_PORT"] = "29500"

            if "WORLD_SIZE" not in os.environ:
                if "OMPI_COMM_WORLD_SIZE" not in os.environ:
                    raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_SIZE is not set")
                os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]

            if "RANK" not in os.environ:
                if "OMPI_COMM_WORLD_RANK" not in os.environ:
                    raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_RANK is not set")
                os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]

            # Initialize DDP module
            dist.init_process_group(backend="nccl", init_method='env://')

        elif method == "nccl-mpich":
            if "MASTER_ADDR" not in os.environ:
                os.environ['MASTER_ADDR'] = "localhost"

            # Use the default PyTorch port
            if "MASTER_PORT" not in os.environ:
                os.environ["MASTER_PORT"] = "29500"

            if "WORLD_SIZE" not in os.environ:
                if "PMI_SIZE" in os.environ:
                    world_size = os.environ["PMI_SIZE"]
                elif MPI.Is_initialized():
                    world_size = MPI.COMM_WORLD.Get_size()
                else:
                    world_size = 1
                os.environ["WORLD_SIZE"] = str(world_size)

            if "RANK" not in os.environ:
                if "PMI_RANK" in os.environ:
                    rank = os.environ["PMI_RANK"]
                elif MPI.Is_initialized():
                    rank = MPI.COMM_WORLD.Get_rank()
                else:
                    rank = 0
                os.environ["RANK"] = str(rank)

            # Initialize DDP module
            dist.init_process_group(backend="nccl", init_method='env://')

        elif method == "gloo":
            if "MASTER_ADDR" not in os.environ:
                # check if OpenMPI is used
                if "PMIX_SERVER_URI2" in os.environ:
                    addr = os.environ["PMIX_SERVER_URI2"]
                    addr = addr.split("//")[1].split(":")[0]
                    os.environ["MASTER_ADDR"] = addr
                else:
                    os.environ['MASTER_ADDR'] = "localhost"

            # Use the default PyTorch port
            if "MASTER_PORT" not in os.environ:
                os.environ["MASTER_PORT"] = "29500"

            # obtain WORLD_SIZE
            if "WORLD_SIZE" not in os.environ:
                # check if OpenMPI is used
                if "OMPI_COMM_WORLD_SIZE" in os.environ:
                    world_size = os.environ["OMPI_COMM_WORLD_SIZE"]
                elif "PMI_SIZE" in os.environ:
                    world_size = os.environ["PMI_SIZE"]
                elif MPI.Is_initialized():
                    world_size = MPI.COMM_WORLD.Get_size()
                else:
                    world_size = 1
                os.environ["WORLD_SIZE"] = str(world_size)

            # obtain RANK
            if "RANK" not in os.environ:
                # check if OpenMPI is used
                if "OMPI_COMM_WORLD_RANK" in os.environ:
                    rank = os.environ["OMPI_COMM_WORLD_RANK"]
                elif "PMI_RANK" in os.environ:
                    rank = os.environ["PMI_RANK"]
                elif MPI.Is_initialized():
                    rank = MPI.COMM_WORLD.Get_rank()
                else:
                    rank = 0
                os.environ["RANK"] = str(rank)

            # Initialize DDP module
            dist.init_process_group(backend="gloo", init_method='env://')

        else:
            raise NotImplementedError()

    def finalize(self):
        dist.destroy_process_group()


#----< init_parallel() >-------------------------------------------------------
def init_parallel():
    # check whether a CUDA device is available; fall back to gloo on CPU-only nodes
    ngpu_per_node = torch.cuda.device_count()
    if not torch.cuda.is_available():
        backend = "gloo"
    else:
        backend = "nccl-mpich"

    # initialize the parallel/distributed environment
    comm = distributed(backend)
    rank = comm.get_rank()
    world_size = comm.get_size()
    local_rank = comm.get_local_rank()

    # select the training device: cpu or cuda
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device("cuda:" + str(local_rank))

    return comm, device
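
Before wiring the module into the training script, it can be smoke-tested on its own. The sketch below is a minimal check, assuming the file above is saved as comm_file.py (the name main.py imports below) and that one process per GPU is launched with mpiexec or srun; on a CPU-only node init_parallel() falls back to the gloo backend.

import comm_file

if __name__ == "__main__":
    # set up the process group and pick this rank's device
    comm, device = comm_file.init_parallel()
    print(f"rank {comm.get_rank()} of {comm.get_size()} -> device {device}")
    comm.finalize()

Every process should report a distinct rank, the same world size, and either cpu or its assigned cuda:<local_rank> device.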

examples/MNIST/MNIST_codes/main.py

Lines changed: 70 additions & 32 deletions

@@ -5,7 +5,9 @@
 import torch.optim as optim
 from torchvision import datasets, transforms
 from torch.optim.lr_scheduler import StepLR
-
+import comm_file
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.distributed import ReduceOp, all_reduce

 class Net(nn.Module):
     def __init__(self):
@@ -33,40 +35,62 @@ def forward(self, x):
         return output


-def train(args, model, device, train_loader, optimizer, epoch):
+def train(args, model, device, train_loader, optimizer, epoch, comm):
     model.train()
+    total_loss = 0.0
+    num_batches = 0
     for batch_idx, (data, target) in enumerate(train_loader):
         data, target = data.to(device), target.to(device)
         optimizer.zero_grad()
         output = model(data)
         loss = F.nll_loss(output, target)
         loss.backward()
         optimizer.step()
-        if batch_idx % args.log_interval == 0:
-            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
-                epoch, batch_idx * len(data), len(train_loader.dataset),
-                100. * batch_idx / len(train_loader), loss.item()))
-            if args.dry_run:
-                break
-
-
-def test(model, device, test_loader):
+
+        total_loss += loss.item()
+        num_batches += 1
+
+    # Compute the average loss for the current epoch
+    avg_loss = total_loss / num_batches
+
+    # Reduce the average loss across all processes
+    avg_loss_tensor = torch.tensor(avg_loss, device=device)
+    all_reduce(avg_loss_tensor, op=ReduceOp.SUM)
+    avg_loss_tensor /= comm.get_size()
+
+    # Print the average loss only from the master process
+    if comm.get_rank() == 0:
+        print(f'Train Epoch: {epoch}\tAverage Loss: {avg_loss_tensor.item():.6f}')
+
+
+def test(model, device, test_loader, comm):
     model.eval()
     test_loss = 0
     correct = 0
+    total_samples = 0
     with torch.no_grad():
         for data, target in test_loader:
             data, target = data.to(device), target.to(device)
             output = model(data)
             test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
             pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
             correct += pred.eq(target.view_as(pred)).sum().item()
-
-    test_loss /= len(test_loader.dataset)
-
-    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
-        test_loss, correct, len(test_loader.dataset),
-        100. * correct / len(test_loader.dataset)))
+            total_samples += data.size(0)
+
+    # Sum the per-rank statistics across all processes before reporting
+    test_loss_tensor = torch.tensor(test_loss, device=device)
+    correct_tensor = torch.tensor(correct, device=device)
+    total_samples_tensor = torch.tensor(total_samples, device=device)
+    all_reduce(test_loss_tensor, op=ReduceOp.SUM)
+    all_reduce(correct_tensor, op=ReduceOp.SUM)
+    all_reduce(total_samples_tensor, op=ReduceOp.SUM)
+    test_loss = test_loss_tensor.item()
+    correct = correct_tensor.item()
+    total_samples = total_samples_tensor.item()
+    avg_loss = test_loss / total_samples
+    accuracy = 100. * correct / total_samples
+
+    if comm.get_rank() == 0:
+        print(f'Test set: Average loss: {avg_loss:.4f}, Accuracy: {correct}/{total_samples} ({accuracy:.0f}%)\n')


 def main():
@@ -100,45 +124,59 @@ def main():

     torch.manual_seed(args.seed)

-    if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
+    # initialize the distributed communicator; it also selects this rank's device
+    comm, device = comm_file.init_parallel()
+
+    rank = comm.get_rank()
+    nprocs = comm.get_size()
+
+    print("nprocs = ", nprocs, " rank = ", rank, " device = ", device)

     train_kwargs = {'batch_size': args.batch_size}
     test_kwargs = {'batch_size': args.test_batch_size}
     if use_cuda:
         cuda_kwargs = {'num_workers': 1,
                        'pin_memory': True,
-                       'shuffle': True}
+                       'shuffle': False}
         train_kwargs.update(cuda_kwargs)
         test_kwargs.update(cuda_kwargs)

     transform=transforms.Compose([
         transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))
         ])
-    dataset1 = datasets.MNIST('../data', train=True, download=True,
+    dataset1 = datasets.MNIST('../MNIST_data', train=True, download=True,
                        transform=transform)
-    dataset2 = datasets.MNIST('../data', train=False,
+    dataset2 = datasets.MNIST('../MNIST_data', train=False,
                        transform=transform)
-    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
-    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
+
+    # shard the training and test sets across ranks with DistributedSampler
+    train_sampler = torch.utils.data.distributed.DistributedSampler(
+        dataset1, num_replicas=comm.get_size(), rank=comm.get_rank(), shuffle=True)
+    test_sampler = torch.utils.data.distributed.DistributedSampler(
+        dataset2, num_replicas=comm.get_size(), rank=comm.get_rank(), shuffle=False)
+    train_loader = torch.utils.data.DataLoader(dataset1, sampler=train_sampler, **train_kwargs)
+    test_loader = torch.utils.data.DataLoader(dataset2, sampler=test_sampler, **test_kwargs, drop_last=False)

     model = Net().to(device)
+    # wrap the model in DistributedDataParallel so gradients are averaged across ranks
+    model = DDP(model, device_ids=[device] if use_cuda else None)
     optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

     scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
     for epoch in range(1, args.epochs + 1):
-        train(args, model, device, train_loader, optimizer, epoch)
-        test(model, device, test_loader)
+        # re-seed the samplers so each epoch draws a different shuffle
+        train_sampler.set_epoch(epoch)
+        test_sampler.set_epoch(epoch)
+
+        train(args, model, device, train_loader, optimizer, epoch, comm)
+        test(model, device, test_loader, comm)
         scheduler.step()

     if args.save_model:
-        torch.save(model.state_dict(), "mnist_cnn.pt")
+        if rank == 0:
+            torch.save(model.state_dict(), "mnist_cnn.pt")
+
+    comm.finalize()


 if __name__ == '__main__':
-    main()
+    main()
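
The DistributedSampler lines added above are what shard MNIST across ranks; that is also why shuffle moves from the DataLoader into the sampler and why set_epoch() is called at the top of every epoch. The standalone sketch below illustrates the behaviour; the 8-element toy dataset and the 2-rank world are illustrative assumptions, and no process group is needed because num_replicas and rank are passed explicitly.

from torch.utils.data.distributed import DistributedSampler

data = list(range(8))                  # stand-in for the MNIST dataset
for epoch in range(2):
    for rank in range(2):              # pretend the job runs with 2 ranks
        sampler = DistributedSampler(data, num_replicas=2, rank=rank, shuffle=True)
        sampler.set_epoch(epoch)       # reshuffles differently for each epoch
        print(f"epoch {epoch} rank {rank} -> indices {list(sampler)}")

Within one epoch the two ranks receive disjoint halves of the indices; changing the epoch changes the shuffle, which is exactly what train_sampler.set_epoch(epoch) provides in the training loop above.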

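One side effect of saving model.state_dict() after the DDP wrap, as rank 0 does above, is that every parameter key gains a module. prefix. The sketch below is one way to load the checkpoint back into a plain, single-process model; it assumes the run finished, mnist_cnn.pt is in the working directory, and main.py (together with its comm_file dependency) is importable so the Net class can be reused.

import torch
from main import Net   # safe to import: main.py guards main() behind __main__

state = torch.load("mnist_cnn.pt", map_location="cpu")
# strip the "module." prefix added by the DistributedDataParallel wrapper
state = {k[len("module."):] if k.startswith("module.") else k: v
         for k, v in state.items()}

model = Net()
model.load_state_dict(state)
model.eval()

Alternatively, the training script could save model.module.state_dict() on rank 0, which keeps the checkpoint keys identical to those of the original non-distributed version.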