Parallel-NetCDF
diff --git a/‎examples/MNIST/Makefile
Lines changed: 25 additions & 0 deletions b/‎examples/MNIST/Makefile
Lines changed: 25 additions & 0 deletions
diff --git a/‎examples/MNIST/comm_file.py
Lines changed: 212 additions & 0 deletions b/‎examples/MNIST/comm_file.py
Lines changed: 212 additions & 0 deletions
diff --git a/‎examples/MNIST/mnist.patch
Lines changed: 134 additions & 0 deletions b/‎examples/MNIST/mnist.patch
Lines changed: 134 additions & 0 deletions
diff --git a/‎examples/MNIST/mnist_images.nc
55.7 KB b/‎examples/MNIST/mnist_images.nc
55.7 KB
@@ -0,0 +1,25 @@
+#
+# Copyright (C) 2024, Northwestern University and Argonne National Laboratory
+# See COPYRIGHT notice in top-level directory.
+#
+
+check_PROGRAMS = mnist_main.py
+
+MNIST_URL = https://raw.githubusercontent.com/pytorch/examples/main/mnist/main.py
+
+# None of these targets creates a file named after itself.
+.PHONY: all ptests check clean
+
+# Download the upstream PyTorch MNIST example and apply the PnetCDF patch.
+# curl -f turns HTTP errors (e.g. 404) into a failure instead of saving the
+# error page as the script.  If either step fails, the partially built file
+# is removed so that the next "make" retries from scratch rather than
+# running an unpatched or truncated script.
+mnist_main.py:
+	curl -fLs $(MNIST_URL) -o $@ || { rm -f $@; exit 1; }
+	patch -st $@ < mnist.patch || { rm -f $@; exit 1; }
+
+all:
+
+# The batch size is divided by the number of processes inside the script,
+# so --batch-size must stay >= the "-n" value given to mpiexec.
+ptests check: mnist_main.py mnist_images.nc
+	@echo "======================================================================"
+	@echo "    examples/MNIST: Parallel testing on 4 MPI processes"
+	@echo "======================================================================"
+	@mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc
+	@echo ""
+
+clean:
+	rm -f mnist_main.py
+
@@ -0,0 +1,212 @@
+import os
+import torch
+import torch.distributed as dist
+from mpi4py import MPI
+
+class distributed():
+    """Initialize and query the ``torch.distributed`` process group.
+
+    The constructor fills in the variables read by the ``env://`` init method
+    (MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK) from the job launcher's
+    environment, leaving any value that is already set untouched, and then
+    calls ``dist.init_process_group``.
+
+    Attributes:
+        mpi_comm: ``MPI.COMM_WORLD``, kept so callers can issue MPI calls.
+    """
+
+    # Fallback rendezvous port (the default pytorch port)
+    _DEFAULT_PORT = "29500"
+
+    @staticmethod
+    def _is_active():
+        """Return True when a torch.distributed process group is initialized."""
+        return dist.is_available() and dist.is_initialized()
+
+    def get_size(self):
+        """Return the world size, or 1 when no process group is initialized."""
+        return dist.get_world_size() if self._is_active() else 1
+
+    def get_rank(self):
+        """Return this process's rank, or 0 when no process group is initialized."""
+        return dist.get_rank() if self._is_active() else 0
+
+    def get_local_rank(self):
+        """Return the index of the GPU this process should use.
+
+        Returns 0 when no process group is initialized and -1 when CUDA is
+        not available (runs on a cpu device should not call this function).
+        """
+        if not self._is_active():
+            return 0
+        if not torch.cuda.is_available():
+            return -1
+        # global rank modulo the number of GPUs per node
+        return dist.get_rank() % torch.cuda.device_count()
+
+    @staticmethod
+    def _pmix_server_host(uri):
+        """Extract the host part of a PMIX_SERVER_URI2 value.
+
+        The text between "//" and the next ":" is returned, e.g.
+        "...;tcp4://10.0.0.5:4321" yields "10.0.0.5".  MASTER_ADDR must not
+        carry the port, which is why it is stripped here.
+        """
+        return uri.split("//")[1].split(":")[0]
+
+    @staticmethod
+    def _count_slurm_tasks(spec):
+        """Return the total task count of a SLURM_TASKS_PER_NODE value.
+
+        The value is a comma separated list in which "N(xR)" means N tasks
+        on each of R consecutive nodes, e.g. "2(x3),1" is 2+2+2+1 = 7 tasks.
+        """
+        total = 0
+        for group in spec.split(","):
+            count, _, repeat = group.partition("(x")
+            total += int(count) * (int(repeat.rstrip(")")) if repeat else 1)
+        return total
+
+    @staticmethod
+    def _setup_slurm_env():
+        """Derive the env:// variables from SLURM's environment.
+
+        Raises:
+            Exception: if a SLURM variable needed for a missing value is
+                not set.
+        """
+        env = os.environ
+
+        # MASTER_ADDR can be set in the slurm batch script using command
+        # scontrol show hostnames $SLURM_JOB_NODELIST
+        if "MASTER_ADDR" not in env:
+            # Try SLURM_LAUNCH_NODE_IPADDR but it is the IP address of the node
+            # from which the task launch was initiated (where the srun command
+            # ran from). It may not be the node of rank 0.
+            if "SLURM_LAUNCH_NODE_IPADDR" not in env:
+                raise Exception("Error: nccl-slurm - SLURM_LAUNCH_NODE_IPADDR is not set")
+            env["MASTER_ADDR"] = env["SLURM_LAUNCH_NODE_IPADDR"]
+
+        # Prefer srun's port, else use the default pytorch port
+        if "MASTER_PORT" not in env:
+            env["MASTER_PORT"] = env.get("SLURM_SRUN_COMM_PORT",
+                                         distributed._DEFAULT_PORT)
+
+        # obtain WORLD_SIZE
+        if "WORLD_SIZE" not in env:
+            if "SLURM_NTASKS" in env:
+                world_size = env["SLURM_NTASKS"]
+            else:
+                if "SLURM_JOB_NUM_NODES" not in env:
+                    raise Exception("Error: nccl-slurm - SLURM_JOB_NUM_NODES is not set")
+                # environment values are strings: convert before doing arithmetic
+                num_nodes = int(env["SLURM_JOB_NUM_NODES"])
+                if "SLURM_NTASKS_PER_NODE" in env:
+                    world_size = int(env["SLURM_NTASKS_PER_NODE"]) * num_nodes
+                elif "SLURM_TASKS_PER_NODE" in env:
+                    # this list already covers every node of the job
+                    world_size = distributed._count_slurm_tasks(
+                        env["SLURM_TASKS_PER_NODE"])
+                else:
+                    raise Exception("Error: nccl-slurm - SLURM_(N)TASKS_PER_NODE is not set")
+            env["WORLD_SIZE"] = str(world_size)
+
+        # obtain RANK
+        if "RANK" not in env:
+            if "SLURM_PROCID" not in env:
+                raise Exception("Error: nccl-slurm - SLURM_PROCID is not set")
+            env["RANK"] = env["SLURM_PROCID"]
+
+    @staticmethod
+    def _setup_openmpi_env():
+        """Derive the env:// variables from OpenMPI's environment.
+
+        Raises:
+            Exception: if an OpenMPI/PMIx variable needed for a missing value
+                is not set.
+        """
+        env = os.environ
+
+        if "MASTER_ADDR" not in env:
+            if "PMIX_SERVER_URI2" not in env:
+                raise Exception("Error: nccl-openmpi - PMIX_SERVER_URI2 is not set")
+            env["MASTER_ADDR"] = distributed._pmix_server_host(env["PMIX_SERVER_URI2"])
+
+        # Use the default pytorch port
+        if "MASTER_PORT" not in env:
+            env["MASTER_PORT"] = distributed._DEFAULT_PORT
+
+        if "WORLD_SIZE" not in env:
+            if "OMPI_COMM_WORLD_SIZE" not in env:
+                raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_SIZE is not set")
+            env["WORLD_SIZE"] = env["OMPI_COMM_WORLD_SIZE"]
+
+        if "RANK" not in env:
+            if "OMPI_COMM_WORLD_RANK" not in env:
+                raise Exception("Error: nccl-openmpi - OMPI_COMM_WORLD_RANK is not set")
+            env["RANK"] = env["OMPI_COMM_WORLD_RANK"]
+
+    @staticmethod
+    def _setup_mpi_env(probe_openmpi):
+        """Derive the env:// variables from PMI variables or from MPI itself.
+
+        Args:
+            probe_openmpi: when True, OpenMPI/PMIx variables are consulted
+                first (used by "gloo"); when False only the PMI variables and
+                MPI are used (used by "nccl-mpich").
+        """
+        env = os.environ
+
+        if "MASTER_ADDR" not in env:
+            if probe_openmpi and "PMIX_SERVER_URI2" in env:
+                env["MASTER_ADDR"] = distributed._pmix_server_host(env["PMIX_SERVER_URI2"])
+            else:
+                env["MASTER_ADDR"] = "localhost"
+
+        # Use the default pytorch port
+        if "MASTER_PORT" not in env:
+            env["MASTER_PORT"] = distributed._DEFAULT_PORT
+
+        # obtain WORLD_SIZE
+        if "WORLD_SIZE" not in env:
+            if probe_openmpi and "OMPI_COMM_WORLD_SIZE" in env:
+                world_size = env["OMPI_COMM_WORLD_SIZE"]
+            elif "PMI_SIZE" in env:
+                world_size = env["PMI_SIZE"]
+            elif MPI.Is_initialized():
+                world_size = MPI.COMM_WORLD.Get_size()
+            else:
+                world_size = 1
+            env["WORLD_SIZE"] = str(world_size)
+
+        # obtain RANK
+        if "RANK" not in env:
+            if probe_openmpi and "OMPI_COMM_WORLD_RANK" in env:
+                rank = env["OMPI_COMM_WORLD_RANK"]
+            elif "PMI_RANK" in env:
+                rank = env["PMI_RANK"]
+            elif MPI.Is_initialized():
+                rank = MPI.COMM_WORLD.Get_rank()
+            else:
+                rank = 0
+            env["RANK"] = str(rank)
+
+    def __init__(self, method):
+        """Set up the environment for ``method`` and create the process group.
+
+        Args:
+            method: one of "nccl-slurm", "nccl-openmpi", "nccl-mpich", "gloo".
+
+        Raises:
+            NotImplementedError: if ``method`` is not one of the above.
+            Exception: if a launcher variable needed to derive a missing
+                setting is not present in the environment.
+        """
+        # MASTER_PORT - required; has to be a free port on machine with rank 0
+        # MASTER_ADDR - required (except for rank 0); address of rank 0 node
+        # WORLD_SIZE - required; can be set either here, or in a call to init function
+        # RANK - required; can be set either here, or in a call to init function
+        self.mpi_comm = MPI.COMM_WORLD
+
+        if method == "nccl-slurm":
+            self._setup_slurm_env()
+            backend = "nccl"
+        elif method == "nccl-openmpi":
+            self._setup_openmpi_env()
+            backend = "nccl"
+        elif method == "nccl-mpich":
+            self._setup_mpi_env(probe_openmpi=False)
+            backend = "nccl"
+        elif method == "gloo":
+            self._setup_mpi_env(probe_openmpi=True)
+            backend = "gloo"
+        else:
+            raise NotImplementedError(
+                "unsupported distributed method: {!r}".format(method))
+
+        # Initialize DDP module
+        dist.init_process_group(backend=backend, init_method='env://')
+
+    def finalize(self):
+        """Destroy the process group; a no-op when none was initialized."""
+        if self._is_active():
+            dist.destroy_process_group()
+
+#----< init_parallel() >-------------------------------------------------------
+def init_parallel():
+    """Initialize the distributed environment and pick the training device.
+
+    The "nccl-mpich" setup is used when CUDA is available and "gloo"
+    otherwise.
+
+    Returns:
+        (comm, device): the ``distributed`` object and the ``torch.device``
+        this process should train on: "cpu", or "cuda:<local rank>".
+    """
+    # check once if cuda device is available; it decides both the
+    # communication backend and the training device
+    use_cuda = torch.cuda.is_available()
+
+    # initialize parallel/distributed environment
+    comm = distributed("nccl-mpich" if use_cuda else "gloo")
+
+    # select training device: cpu or cuda
+    if use_cuda:
+        device = torch.device("cuda:" + str(comm.get_local_rank()))
+    else:
+        device = torch.device("cpu")
+
+    return comm, device
+
+
@@ -0,0 +1,134 @@
+--- mnist_main_original.py	2024-08-10 17:30:08.552324326 -0500
++++ pnetcdf_mnist.py	2024-08-10 18:02:49.008705003 -0500
+@@ -1,3 +1,8 @@
++#
++# Copyright (C) 2024, Northwestern University and Argonne National Laboratory
++# See COPYRIGHT notice in top-level directory.
++#
++
+ import argparse
+ import torch
+ import torch.nn as nn
+@@ -5,7 +10,11 @@
+ import torch.optim as optim
+ from torchvision import datasets, transforms
+ from torch.optim.lr_scheduler import StepLR
++from torch.nn.parallel import DistributedDataParallel as DDP
++from torch.utils.data.distributed import DistributedSampler
+ 
++import comm_file, pnetcdf_io
++from mpi4py import MPI
+ 
+ class Net(nn.Module):
+     def __init__(self):
+@@ -42,14 +51,13 @@
+         loss = F.nll_loss(output, target)
+         loss.backward()
+         optimizer.step()
+-        if batch_idx % args.log_interval == 0:
++        if rank == 0 and batch_idx % args.log_interval == 0:
+             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                 epoch, batch_idx * len(data), len(train_loader.dataset),
+                 100. * batch_idx / len(train_loader), loss.item()))
+             if args.dry_run:
+                 break
+ 
+-
+ def test(model, device, test_loader):
+     model.eval()
+     test_loss = 0
+@@ -62,9 +70,14 @@
+             pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
+             correct += pred.eq(target.view_as(pred)).sum().item()
+ 
++    # aggregate loss among all ranks
++    test_loss = comm.mpi_comm.allreduce(test_loss, op=MPI.SUM)
++    correct = comm.mpi_comm.allreduce(correct, op=MPI.SUM)
++
+     test_loss /= len(test_loader.dataset)
+ 
+-    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
++    if rank == 0:
++        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+         test_loss, correct, len(test_loader.dataset),
+         100. * correct / len(test_loader.dataset)))
+ 
+@@ -94,6 +107,8 @@
+                         help='how many batches to wait before logging training status')
+     parser.add_argument('--save-model', action='store_true', default=False,
+                         help='For Saving the current Model')
++    parser.add_argument('--input-file', type=str, required=True,
++                        help='NetCDF file storing train and test samples')
+     args = parser.parse_args()
+     use_cuda = not args.no_cuda and torch.cuda.is_available()
+     use_mps = not args.no_mps and torch.backends.mps.is_available()
+@@ -107,7 +122,7 @@
+     else:
+         device = torch.device("cpu")
+ 
+-    train_kwargs = {'batch_size': args.batch_size}
++    train_kwargs = {'batch_size': args.batch_size//nprocs}
+     test_kwargs = {'batch_size': args.test_batch_size}
+     if use_cuda:
+         cuda_kwargs = {'num_workers': 1,
+@@ -120,25 +135,53 @@
+         transforms.ToTensor(),
+         transforms.Normalize((0.1307,), (0.3081,))
+         ])
+-    dataset1 = datasets.MNIST('../data', train=True, download=True,
+-                       transform=transform)
+-    dataset2 = datasets.MNIST('../data', train=False,
+-                       transform=transform)
+-    train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
+-    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
++
++    # Open files storing training and testing samples
++    infile = args.input_file
++    train_file = pnetcdf_io.dataset(infile, 'train_images', 'train_labels', transform, comm.mpi_comm)
++    test_file = pnetcdf_io.dataset(infile, 'test_images', 'test_labels', transform, comm.mpi_comm)
++
++    # create distributed samplers
++    train_sampler = DistributedSampler(train_file, num_replicas=nprocs, rank=rank, shuffle=True)
++    test_sampler = DistributedSampler(test_file, num_replicas=nprocs, rank=rank, shuffle=False)
++
++    # add distributed samplers to DataLoaders
++    train_loader = torch.utils.data.DataLoader(train_file, sampler=train_sampler, **train_kwargs)
++    test_loader = torch.utils.data.DataLoader(test_file, sampler=test_sampler, **test_kwargs, drop_last=False)
+ 
+     model = Net().to(device)
++
++    # use DDP
++    model = DDP(model, device_ids=[device] if use_cuda else None)
++
+     optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
+ 
+     scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
+     for epoch in range(1, args.epochs + 1):
+        # tell the samplers the current epoch so the shuffled order changes each epoch
++        train_sampler.set_epoch(epoch)
++        test_sampler.set_epoch(epoch)
++
+         train(args, model, device, train_loader, optimizer, epoch)
+         test(model, device, test_loader)
+         scheduler.step()
+ 
+     if args.save_model:
+-        torch.save(model.state_dict(), "mnist_cnn.pt")
++        if rank == 0:
++            torch.save(model.state_dict(), "mnist_cnn.pt")
+ 
++    # close files
++    train_file.close()
++    test_file.close()
+ 
+ if __name__ == '__main__':
++    ## initialize parallel environment
++    comm, device = comm_file.init_parallel()
++
++    rank = comm.get_rank()
++    nprocs = comm.get_size()
++
+     main()
++
++    comm.finalize()
++