Skip to content

Commit 995a2b1

Browse files
authored
Merge pull request #43 from KWang1998/ml_examples
Add example program of MNIST using pytorch DDP
2 parents 0157cc3 + d24be5e commit 995a2b1

File tree

14 files changed

+786
-16
lines changed

14 files changed

+786
-16
lines changed

.github/workflows/pnetcdf_c_master.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ jobs:
8383
pip install numpy cython cftime pytest twine wheel check-manifest
8484
export MPICC=$MPICH_DIR/bin/mpicc
8585
pip install mpi4py
86-
pip install torch
86+
pip install torch torchvision
8787
8888
- name: Install PnetCDF-Python
8989
run: |

.github/workflows/pnetcdf_c_official.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ jobs:
8181
pip install numpy cython cftime pytest twine wheel check-manifest
8282
export MPICC=$MPICH_DIR/bin/mpicc
8383
pip install mpi4py
84-
pip install torch
84+
pip install torch torchvision
8585
8686
- name: Install PnetCDF-Python
8787
run: |

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# See COPYRIGHT notice in top-level directory.
44
#
55

6+
all:
7+
68
check:
79
cd test && make check
810
cd examples && make check
@@ -32,3 +34,5 @@ build-clean: clean
3234
install-clean: build-clean
3335
rm -rf dist
3436

37+
.PHONY: all check ptests clean build-clean install-clean
38+

examples/MNIST/Makefile

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#
2+
# Copyright (C) 2024, Northwestern University and Argonne National Laboratory
3+
# See COPYRIGHT notice in top-level directory.
4+
#
5+
6+
# Example program that this Makefile generates and runs.
check_PROGRAMS := mnist_main.py

# Upstream PyTorch MNIST example from which mnist_main.py is produced.
MNIST_URL := https://raw.githubusercontent.com/pytorch/examples/main/mnist/main.py
9+
10+
# Default goal (first rule in this file). It has no prerequisites and no
# recipe, so a bare `make` in this directory does nothing.
all:
11+
12+
# Generate mnist_main.py: download the upstream example from $(MNIST_URL) and
# apply mnist.patch to it.
#
# All work happens on a temporary file that is renamed to the target only
# after both steps succeed.  `curl -f` turns an HTTP error into a non-zero
# exit (instead of saving the server's error page) and `-S` keeps the error
# message visible despite `-s`.  If either step fails, the temporary file and
# any patch leftovers are removed, so make never sees a half-built
# mnist_main.py that it would consider up to date.
mnist_main.py:
	@curl -LsSf $(MNIST_URL) -o $@.tmp || { rm -f $@.tmp; exit 1; }
	@patch -st $@.tmp < mnist.patch || { rm -f $@.tmp $@.tmp.orig $@.tmp.rej; exit 1; }
	@mv -f $@.tmp $@
15+
16+
# Base URL from which the gzip-compressed MNIST dataset files are fetched.
MNIST_DATA_URL := https://yann.lecun.com/exdb/mnist

# Names of the uncompressed dataset files (images and labels for the
# "train" and "t10k" sets).
MNIST_DATASETS := \
    train-images-idx3-ubyte \
    train-labels-idx1-ubyte \
    t10k-images-idx3-ubyte \
    t10k-labels-idx1-ubyte

# The same names with ".gz" appended, i.e. the files as downloaded.
MNIST_DATASETS_GZ := $(addsuffix .gz,$(MNIST_DATASETS))
24+
25+
# Download and decompress the raw MNIST dataset files.
#
# One rule header lists every name in $(MNIST_DATASETS); make treats this as
# an independent rule per target that shares the same recipe, with $@ being
# the dataset file currently being built.  `curl -O` saves $@.gz in the
# current directory (`-f` makes HTTP errors fail the recipe), and gunzip then
# replaces $@.gz with $@.
$(MNIST_DATASETS):
	@curl -LOsf $(MNIST_DATA_URL)/$@.gz
	@gunzip $@.gz
40+
41+
# Create the NetCDF input file from the raw MNIST dataset files listed in
# $(MNIST_DATASETS), which are downloaded first if they are missing.
# The output name is passed explicitly so that the file written is always
# this rule's target, rather than relying on the script's default name.
mnist_images.nc: $(MNIST_DATASETS)
	@python create_mnist_netcdf.py --out-file $@
43+
44+
# Test targets: make sure mnist_main.py has been generated, print a banner,
# then run the training program on 4 MPI processes, reading its samples from
# mnist_images.nc.  A trailing blank line separates this test's output from
# whatever follows.
ptests check: mnist_main.py
	@printf '%s\n' \
	    "======================================================================" \
	    " examples/MNIST: Parallel testing on 4 MPI processes" \
	    "======================================================================"
	@mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc
	@printf '\n'
50+
51+
# Delete the generated program and the downloaded dataset files, both the
# uncompressed ones and any .gz archives left behind.
# $(RM) is make's predefined "rm -f".
clean:
	$(RM) mnist_main.py
	$(RM) $(MNIST_DATASETS)
	$(RM) $(MNIST_DATASETS_GZ)
55+
56+
# These targets name actions rather than files; declaring them phony keeps
# them working even if a file with the same name exists in this directory.
.PHONY: all check ptests clean
57+

examples/MNIST/README.md

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# MNIST example using PnetCDF-Python to Read Input Data
2+
3+
This directory contains files for running the PyTorch example program,
4+
[MNIST](https://github.com/pytorch/examples/tree/main/mnist),
5+
using the PyTorch module `DistributedDataParallel` for parallel training and
6+
`PnetCDF-Python` for reading data from a NetCDF file.
7+
8+
## Running the MNIST Example Program
9+
10+
* First, run the command below to generate the Python program file.
11+
```sh
12+
make mnist_main.py
13+
```
14+
* Then run the command below to train the model using 4 MPI processes.
15+
```sh
16+
mpiexec -n 4 python mnist_main.py --batch-size 4 --test-batch-size 2 --epochs 3 --input-file mnist_images.nc
17+
```
18+
19+
* `mnist_main.py` command-line options
20+
```
21+
-h, --help show this help message and exit
22+
--batch-size N input batch size for training (default: 64)
23+
--test-batch-size N input batch size for testing (default: 1000)
24+
--epochs N number of epochs to train (default: 14)
25+
--lr LR learning rate (default: 1.0)
26+
--gamma M Learning rate step gamma (default: 0.7)
27+
--no-cuda disables CUDA training
28+
--no-mps disables macOS GPU training
29+
--dry-run quickly check a single pass
30+
--seed S random seed (default: 1)
31+
--log-interval N how many batches to wait before logging training status
32+
--save-model For Saving the current Model
33+
--input-file INPUT_FILE
34+
NetCDF file storing train and test samples
35+
```
36+
37+
## Testing
38+
* Command `make check` will do the following.
39+
+ Downloads the Python source code
40+
[main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py)
41+
from [Pytorch Examples](https://github.com/pytorch/examples) as file
42+
`mnist_main.py`.
43+
+ Applies patch file [mnist.patch](./mnist.patch) to `mnist_main.py`.
44+
+ Runs the training program `mnist_main.py` in parallel using 4 MPI processes.
45+
46+
* Testing output shown on screen.
47+
```
48+
======================================================================
49+
examples/MNIST: Parallel testing on 4 MPI processes
50+
======================================================================
51+
Train Epoch: 1 [0/60 (0%)] Loss: 2.514259
52+
Train Epoch: 1 [10/60 (67%)] Loss: 1.953820
53+
54+
Test set: Average loss: 2.2113, Accuracy: 4/12 (33%)
55+
56+
Train Epoch: 2 [0/60 (0%)] Loss: 2.359334
57+
Train Epoch: 2 [10/60 (67%)] Loss: 2.092178
58+
59+
Test set: Average loss: 1.4825, Accuracy: 6/12 (50%)
60+
61+
Train Epoch: 3 [0/60 (0%)] Loss: 2.067438
62+
Train Epoch: 3 [10/60 (67%)] Loss: 0.010670
63+
64+
Test set: Average loss: 1.2531, Accuracy: 7/12 (58%)
65+
```
66+
67+
## Generate the Input NetCDF File From MNIST Datasets
68+
* Utility program [create_mnist_netcdf.py](./create_mnist_netcdf.py)
69+
can be used to extract a subset of images into a NetCDF file.
70+
* Command `make mnist_images.nc` will first download the MNIST data files from
71+
https://yann.lecun.com/exdb/mnist and extract 60 images as training samples
72+
and 12 images as testing samples into a new file named `mnist_images.nc`.
73+
* `create_mnist_netcdf.py` can also be run individually to extract a different
74+
number of images using command-line options shown below.
75+
* `create_mnist_netcdf.py` command-line options:
76+
```
77+
-h, --help show this help message and exit
78+
--verbose Verbose mode
79+
--train-size N Number of training samples extracted from the input file (default: 60)
80+
--test-size N Number of testing samples extracted from the input file (default: 12)
81+
--train-data-file TRAIN_DATA_FILE
82+
(Optional) input file name of training data
83+
--train-label-file TRAIN_LABEL_FILE
84+
(Optional) input file name of training labels
85+
--test-data-file TEST_DATA_FILE
86+
(Optional) input file name of testing data
87+
--test-label-file TEST_LABEL_FILE
88+
(Optional) input file name of testing labels
89+
--out-file OUT_FILE (Optional) output NetCDF file name
90+
```
91+
* The NetCDF file metadata can be obtained by running command "ncmpidump -h" or
92+
"ncdump -h".
93+
```sh
94+
% ncmpidump -h mnist_images.nc
95+
netcdf mnist_images {
96+
// file format: CDF-5 (big variables)
97+
dimensions:
98+
height = 28 ;
99+
width = 28 ;
100+
train_num = 60 ;
101+
test_num = 12 ;
102+
variables:
103+
ubyte train_samples(train_num, height, width) ;
104+
train_samples:long_name = "training data samples" ;
105+
ubyte train_labels(train_num) ;
106+
train_labels:long_name = "labels of training samples" ;
107+
ubyte test_samples(test_num, height, width) ;
108+
test_samples:long_name = "testing data samples" ;
109+
ubyte test_labels(test_num) ;
110+
test_labels:long_name = "labels of testing samples" ;
111+
112+
// global attributes:
113+
:url = "https://yann.lecun.com/exdb/mnist/" ;
114+
}
115+
```
116+
117+
## Files in this directory
118+
* [mnist.patch](./mnist.patch) --
119+
a patch file to be applied on
120+
[main.py](https://github.com/pytorch/examples/blob/main/mnist/main.py)
121+
once downloaded from [Pytorch Examples](https://github.com/pytorch/examples)
122+
before running the model training.
123+
124+
* [comm_file.py](./comm_file.py) --
125+
implements the parallel environment for training the model in parallel.
126+
127+
* [pnetcdf_io.py](./pnetcdf_io.py) --
128+
implements the file I/O using PnetCDF-Python.
129+
130+
* [create_mnist_netcdf.py](./create_mnist_netcdf.py) --
131+
a utility Python program that reads the MNIST files, extracts a subset of the
132+
samples, and stores them into a newly created file in NetCDF format.
133+
134+
### Notes:
135+
- The test set accuracy may vary slightly depending on how the data is distributed across the MPI processes.
136+
- The accuracy and loss reported after each epoch are averaged across all MPI processes.
137+

0 commit comments

Comments
 (0)