
Commit 25e92d9 (parent: 31d744a)

    revise README.md and Makefile

4 files changed: 154 additions, 56 deletions

examples/MNIST/Makefile

Lines changed: 35 additions & 3 deletions

@@ -8,8 +8,37 @@ check_PROGRAMS = mnist_main.py
 MNIST_URL = https://raw.githubusercontent.com/pytorch/examples/main/mnist/main.py
 
 mnist_main.py:
-	curl -Ls $(MNIST_URL) -o $@
-	patch -st $@ < mnist.patch
+	@curl -Ls $(MNIST_URL) -o $@
+	@patch -st $@ < mnist.patch
+
+# https://yann.lecun.com/exdb/mnist
+MNIST_DATA_URL = https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data
+
+MNIST_DATASETS = train-images-idx3-ubyte \
+                 train-labels-idx1-ubyte \
+                 t10k-images-idx3-ubyte \
+                 t10k-labels-idx1-ubyte
+
+MNIST_DATASETS_GZ = $(MNIST_DATASETS:=.gz)
+
+train-images-idx3-ubyte:
+	@curl -LOs $(MNIST_DATA_URL)/$@.gz
+	@gunzip $@.gz
+
+train-labels-idx1-ubyte:
+	@curl -LOs $(MNIST_DATA_URL)/$@.gz
+	@gunzip $@.gz
+
+t10k-images-idx3-ubyte:
+	@curl -LOs $(MNIST_DATA_URL)/$@.gz
+	@gunzip $@.gz
+
+t10k-labels-idx1-ubyte:
+	@curl -LOs $(MNIST_DATA_URL)/$@.gz
+	@gunzip $@.gz
+
+mnist_images.nc: $(MNIST_DATASETS)
+	@python create_mnist_netcdf.py
 
 
 all:
@@ -21,5 +50,8 @@ ptests check: mnist_main.py mnist_images.nc
 	@echo ""
 
 clean:
-	rm -rf mnist_main.py
+	rm -f mnist_main.py
+	rm -f $(MNIST_DATASETS)
+	rm -f $(MNIST_DATASETS_GZ)
+	rm -f mnist_images.nc
 
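The four dataset rules above download gzipped files in the IDX binary layout
documented at the MNIST page cited in the Makefile comment: a big-endian
integer header followed by raw unsigned bytes. As a rough illustration of
what a reader such as `create_mnist_netcdf.py` has to parse, a minimal
sketch (helper names hypothetical, not part of this commit) could look like:

```python
import struct
import numpy as np

def read_idx_images(path):
    # IDX image header: magic 2051, count, rows, cols (big-endian uint32)
    with open(path, "rb") as f:
        magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
        assert magic == 2051, "not an IDX image file"
        return np.frombuffer(f.read(), dtype=np.uint8).reshape(num, rows, cols)

def read_idx_labels(path):
    # IDX label header: magic 2049, count (big-endian uint32)
    with open(path, "rb") as f:
        magic, num = struct.unpack(">II", f.read(8))
        assert magic == 2049, "not an IDX label file"
        return np.frombuffer(f.read(), dtype=np.uint8)

# file names match the Makefile targets above
images = read_idx_images("train-images-idx3-ubyte")   # shape (60000, 28, 28)
labels = read_idx_labels("train-labels-idx1-ubyte")   # shape (60000,)
```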

examples/MNIST/README.md

Lines changed: 102 additions & 38 deletions

@@ -1,45 +1,109 @@
-# PnetCDF-python MNIST example
+# MNIST example using PnetCDF-Python to Read Input Data
 
-This directory contains the description and run instructions for the MNIST example Python programs that utilize PnetCDF for file I/O and parallel training with MNIST data.
-
-## Directory Structure
-
-- **MNIST_data**: This folder contains a mini MNIST test dataset stored in a NetCDF file (`mnist_images_mini.nc`). The file includes:
-  - 60 training samples
-  - 12 testing samples
-
-- **MNIST_codes**: This folder contains the example MNIST training code. The example code is based on the [PyTorch MNIST example](https://github.com/pytorch/examples/tree/main/mnist) and uses `DistributedDataParallel` for parallel training.
+This directory contains files for running the Pytorch example program
+[MNIST](https://github.com/pytorch/examples/tree/main/mnist),
+using the Pytorch module `DistributedDataParallel` for parallel training and
+`PnetCDF-Python` for reading data from a NetCDF file.
 
+---
 ## Running the MNIST Example Program
 
-To run the MNIST example program, use the `mpiexec` command. The example below runs the program on 4 MPI processes.
-
-### Command:
-
-```sh
-mpiexec -n 4 python main.py
-```
-
-### Expected Output:
-
-When using 4 MPI processes, the output is expected to be similar to the following:
-
-```sh
-nprocs = 4 rank = 0 device = cpu mpi_size = 4 mpi_rank = 0
-nprocs = 4 rank = 2 device = cpu mpi_size = 4 mpi_rank = 2
-nprocs = 4 rank = 1 device = cpu mpi_size = 4 mpi_rank = 1
-nprocs = 4 rank = 3 device = cpu mpi_size = 4 mpi_rank = 3
-
-Train Epoch: 1 Average Loss: 2.288340
-Test set: Average loss: 2.7425, Accuracy: 0/12 (0%)
-
-Train Epoch: 2 Average Loss: 2.490800
-Test set: Average loss: 1.9361, Accuracy: 6/12 (50%)
-
-Train Epoch: 3 Average Loss: 2.216520
-Test set: Average loss: 1.8703, Accuracy: 7/12 (58%)
-```
-
+* First, run the commands below to generate the Python program file and the
+  NetCDF input file.
+  ```sh
+  make mnist_main.py
+  make mnist_images.nc
+  ```
+* Run the command below to train the model using 4 MPI processes.
+  ```sh
+  mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc
+  ```
+
+## Testing
+* Command `make check` will do the following.
+  + Downloads the Python source code
+    [main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py)
+    from [Pytorch Examples](https://github.com/pytorch/examples) as file
+    `mnist_main.py`.
+  + Applies the patch file [mnist.patch](./mnist.patch) to `mnist_main.py`.
+  + Downloads the MNIST data sets from
+    [golbin/TensorFlow-MNIST](https://github.com/golbin/TensorFlow-MNIST/raw/master/mnist/data).
+  + Runs the utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py)
+    to extract a subset of the images into a NetCDF file.
+  + Runs the training program `mnist_main.py`.
+
+* Testing output shown on screen:
+  ```
+  ======================================================================
+  examples/MNIST: Parallel testing on 4 MPI processes
+  ======================================================================
+  Train Epoch: 1 [0/60 (0%)]	Loss: 2.514259
+  Train Epoch: 1 [10/60 (67%)]	Loss: 1.953820
+
+  Test set: Average loss: 2.2113, Accuracy: 4/12 (33%)
+
+  Train Epoch: 2 [0/60 (0%)]	Loss: 2.359334
+  Train Epoch: 2 [10/60 (67%)]	Loss: 2.092178
+
+  Test set: Average loss: 1.4825, Accuracy: 6/12 (50%)
+
+  Train Epoch: 3 [0/60 (0%)]	Loss: 2.067438
+  Train Epoch: 3 [10/60 (67%)]	Loss: 0.010670
+
+  Test set: Average loss: 1.2531, Accuracy: 7/12 (58%)
+  ```
+
+## mnist_main.py command-line options
+```
+  -h, --help           show this help message and exit
+  --batch-size N       input batch size for training (default: 64)
+  --test-batch-size N  input batch size for testing (default: 1000)
+  --epochs N           number of epochs to train (default: 14)
+  --lr LR              learning rate (default: 1.0)
+  --gamma M            Learning rate step gamma (default: 0.7)
+  --no-cuda            disables CUDA training
+  --no-mps             disables macOS GPU training
+  --dry-run            quickly check a single pass
+  --seed S             random seed (default: 1)
+  --log-interval N     how many batches to wait before logging training status
+  --save-model         For Saving the current Model
+  --input-file INPUT_FILE
+                       NetCDF file storing train and test samples
+```
+
+## create_mnist_netcdf.py command-line options
+```
+  -h, --help           show this help message and exit
+  --verbose            Verbose mode
+  --train-size N       Number of training samples extracted from the input file (default: 60)
+  --test-size N        Number of testing samples extracted from the input file (default: 12)
+  --train-data-file TRAIN_DATA_FILE
+                       (Optional) input file name of training data
+  --train-label-file TRAIN_LABEL_FILE
+                       (Optional) input file name of training labels
+  --test-data-file TEST_DATA_FILE
+                       (Optional) input file name of testing data
+  --test-label-file TEST_LABEL_FILE
+                       (Optional) input file name of testing labels
+```
+
+---
+## Files in this directory
+* [mnist.patch](./mnist.patch) --
+  a patch file to be applied to
+  [main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py)
+  once it has been downloaded from
+  [Pytorch Examples](https://github.com/pytorch/examples) and before
+  running the model training.
+
+* [comm_file.py](./comm_file.py) --
+  implements the parallel environment for training the model in parallel.
+
+* [pnetcdf_io.py](./pnetcdf_io.py) --
+  implements the file I/O using PnetCDF-Python.
+
+* [create_mnist_netcdf.py](./create_mnist_netcdf.py) --
+  a utility Python program that reads the MNIST files, extracts a subset of
+  the samples, and stores them into a newly created file in NetCDF format.
+
+---
 ### Notes:
 - The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes.
 - The accuracy and loss reported after each epoch are averaged across all MPI processes.
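For context on the `--input-file` option above: `pnetcdf_io.py` reads the
training and testing samples from the NetCDF file in parallel. A minimal
sketch of that read path, assuming the `pnetcdf.File` API of PnetCDF-Python
and the variable names `train_samples`/`train_labels` introduced by
mnist.patch in this commit (illustrative only, not the repository's code):

```python
from mpi4py import MPI
import pnetcdf

comm = MPI.COMM_WORLD

# every rank opens the same file collectively with the MPI communicator
f = pnetcdf.File("mnist_images.nc", mode="r", comm=comm)

images = f.variables["train_samples"]  # e.g. 60 samples of 28 x 28 pixels
labels = f.variables["train_labels"]

sample = images[0]   # numpy array holding the first image
digit  = labels[0]   # its label

f.close()
```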

examples/MNIST/mnist.patch

Lines changed: 15 additions & 15 deletions

Note: '-'/'+' line pairs below with identical visible content differ only in
trailing whitespace, which this commit strips from the patch file.

@@ -1,5 +1,5 @@
 --- mnist_main_original.py	2024-08-10 17:30:08.552324326 -0500
-+++ pnetcdf_mnist.py	2024-08-10 18:02:49.008705003 -0500
++++ pnetcdf_mnist.py	2024-08-11 16:10:31.895471785 -0500
 @@ -1,3 +1,8 @@
 +#
 +# Copyright (C) 2024, Northwestern University and Argonne National Laboratory
@@ -15,10 +15,10 @@
  from torch.optim.lr_scheduler import StepLR
 +from torch.nn.parallel import DistributedDataParallel as DDP
 +from torch.utils.data.distributed import DistributedSampler
-+ 
++
 +import comm_file, pnetcdf_io
 +from mpi4py import MPI
-+ 
++
  class Net(nn.Module):
      def __init__(self):
 @@ -42,14 +51,13 @@
@@ -32,27 +32,27 @@
              100. * batch_idx / len(train_loader), loss.item()))
          if args.dry_run:
              break
-+ 
++
 -
  def test(model, device, test_loader):
      model.eval()
      test_loss = 0
 @@ -62,9 +70,14 @@
          pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
          correct += pred.eq(target.view_as(pred)).sum().item()
-+ 
++
 +    # aggregate loss among all ranks
 +    test_loss = comm.mpi_comm.allreduce(test_loss, op=MPI.SUM)
 +    correct = comm.mpi_comm.allreduce(correct, op=MPI.SUM)
 +
     test_loss /= len(test_loader.dataset)
-+ 
++
 -    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
 +    if rank == 0:
 +        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
         test_loss, correct, len(test_loader.dataset),
         100. * correct / len(test_loader.dataset)))
-+ 
++
 @@ -94,6 +107,8 @@
          help='how many batches to wait before logging training status')
      parser.add_argument('--save-model', action='store_true', default=False,
@@ -65,7 +65,7 @@
 @@ -107,7 +122,7 @@
     else:
         device = torch.device("cpu")
-+ 
++
 -    train_kwargs = {'batch_size': args.batch_size}
 +    train_kwargs = {'batch_size': args.batch_size//nprocs}
     test_kwargs = {'batch_size': args.test_batch_size}
@@ -84,8 +84,8 @@
 +
 +    # Open files storing training and testing samples
 +    infile = args.input_file
-+    train_file = pnetcdf_io.dataset(infile, 'train_images', 'train_labels', transform, comm.mpi_comm)
-+    test_file = pnetcdf_io.dataset(infile, 'test_images', 'test_labels', transform, comm.mpi_comm)
++    train_file = pnetcdf_io.dataset(infile, 'train_samples', 'train_labels', transform, comm.mpi_comm)
++    test_file = pnetcdf_io.dataset(infile, 'test_samples', 'test_labels', transform, comm.mpi_comm)
 +
 +    # create distributed samplers
 +    train_sampler = DistributedSampler(train_file, num_replicas=nprocs, rank=rank, shuffle=True)
@@ -94,14 +94,14 @@
 +    # add distributed samplers to DataLoaders
 +    train_loader = torch.utils.data.DataLoader(train_file, sampler=train_sampler, **train_kwargs)
 +    test_loader = torch.utils.data.DataLoader(test_file, sampler=test_sampler, **test_kwargs, drop_last=False)
-+ 
++
     model = Net().to(device)
 +
 +    # use DDP
 +    model = DDP(model, device_ids=[device] if use_cuda else None)
 +
     optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
-+ 
++
     scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
     for epoch in range(1, args.epochs + 1):
 +        # train sampler set epoch
@@ -111,16 +111,16 @@
         train(args, model, device, train_loader, optimizer, epoch)
         test(model, device, test_loader)
         scheduler.step()
-+ 
++
     if args.save_model:
 -        torch.save(model.state_dict(), "mnist_cnn.pt")
 +        if rank == 0:
 +            torch.save(model.state_dict(), "mnist_cnn.pt")
-+ 
++
 +    # close files
 +    train_file.close()
 +    test_file.close()
-+ 
++
  if __name__ == '__main__':
 +    ## initialize parallel environment
 +    comm, device = comm_file.init_parallel()
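The patch relies on two things it does not itself set up: an object whose
`mpi_comm` member backs the allreduce calls, and an initialized
torch.distributed process group, which `DDP` requires before it can wrap the
model. A hedged sketch of an initializer in the spirit of
`comm_file.init_parallel()`, assuming mpi4py and the gloo backend
(illustrative only, not the repository's comm_file.py):

```python
import os
import torch
import torch.distributed as dist
from mpi4py import MPI

class ParallelEnv:
    def __init__(self, comm):
        self.mpi_comm = comm          # used as comm.mpi_comm in the patch
        self.rank = comm.Get_rank()
        self.nprocs = comm.Get_size()

def init_parallel():
    comm = MPI.COMM_WORLD
    # rendezvous address for torch.distributed; single-node assumption here
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=comm.Get_rank(),
                            world_size=comm.Get_size())
    if torch.cuda.is_available():
        device = torch.device("cuda", comm.Get_rank() % torch.cuda.device_count())
    else:
        device = torch.device("cpu")
    return ParallelEnv(comm), device
```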

examples/Makefile

Lines changed: 2 additions & 0 deletions

@@ -61,4 +61,6 @@ ptest8:
 
 clean:
 	rm -rf ${OUTPUT_DIR}
+	cd Pytorch_DDP && make clean
+	cd MNIST && make clean
 
