
Commit cbe274c

Merge branch 'main' into executorch-redirects

2 parents: 8b1d466 + 33753d3

36 files changed, +334 -2511 lines

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@

 NOT_RUN = [
     "beginner_source/basics/intro", # no code
+    "beginner_source/introyt/introyt_index", # no code
     "beginner_source/onnx/intro_onnx",
     "beginner_source/profiler",
     "beginner_source/saving_loading_models",

.lycheeignore

Lines changed: 3 additions & 0 deletions

@@ -6,3 +6,6 @@ file:///f:/libtmp/some_file

 #Ignore links with "file:///" to catch any other example links
 file:\/\/\/.*
+
+# Ignore colab link in the setting of conf.py
+https://pytorch.org/tutorials/beginner/colab/n

_static/css/custom2.css

Lines changed: 18 additions & 0 deletions

@@ -17,3 +17,21 @@
     margin-bottom: 5px;
   }
 }
+
+/* Left nav for 2nd level nav */
+
+.pytorch-left-menu li.toctree-l2 {
+  padding-left: 10px;
+}
+
+.pytorch-left-menu li.toctree-l2.current > a, {
+  color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l2.current a:link.reference.internal {
+  color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l1.current > a:before {
+  content: "";
+}

_static/js/custom.js

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+document.addEventListener("DOMContentLoaded", function() {
+  // Select all <li> elements with the class "toctree-l1"
+  var toctreeItems = document.querySelectorAll('li.toctree-l1');
+
+  toctreeItems.forEach(function(item) {
+    // Find the link within the item
+    var link = item.querySelector('a');
+    var nestedList = item.querySelector('ul');
+
+    if (link && nestedList) {
+      // Create a span element for the "[+]" or "[-]" sign
+      var expandSign = document.createElement('span');
+      expandSign.style.cursor = 'pointer'; // Make it look clickable
+
+      // Use the link text as a unique key for localStorage
+      var sectionKey = 'section_' + link.textContent.trim().replace(/\s+/g, '_');
+
+      // Retrieve the saved state from localStorage
+      var isExpanded = localStorage.getItem(sectionKey);
+
+      // If no state is saved, default to expanded for "Learn the Basics" and collapsed for others
+      if (isExpanded === null) {
+        isExpanded = (link.textContent.trim() === 'Learn the Basics') ? 'true' : 'false';
+        localStorage.setItem(sectionKey, isExpanded);
+      }
+
+      if (isExpanded === 'true') {
+        nestedList.style.display = 'block'; // Expand the section
+        expandSign.textContent = '[-] '; // Show "[-]" since it's expanded
+      } else {
+        nestedList.style.display = 'none'; // Collapse the section
+        expandSign.textContent = '[+] '; // Show "[+]" since it's collapsed
+      }
+
+      // Add a click event to toggle the nested list
+      expandSign.addEventListener('click', function() {
+        if (nestedList.style.display === 'none') {
+          nestedList.style.display = 'block';
+          expandSign.textContent = '[-] '; // Change to "[-]" when expanded
+          localStorage.setItem(sectionKey, 'true'); // Save state
+        } else {
+          nestedList.style.display = 'none';
+          expandSign.textContent = '[+] '; // Change back to "[+]" when collapsed
+          localStorage.setItem(sectionKey, 'false'); // Save state
+        }
+      });
+
+      // Insert the sign before the link
+      link.parentNode.insertBefore(expandSign, link);
+    }
+  });
+});

_templates/layout.html

Lines changed: 19 additions & 0 deletions

@@ -1,5 +1,23 @@
 {% extends "!layout.html" %}

+
+<!-- Overrides needed for the multilevel nav -->
+{% block menu %}
+  {% if 'singlehtml' not in builder %}
+    {% set global_toc = toctree(collapse=theme_collapse_navigation|tobool,
+                                includehidden=theme_includehidden|tobool,
+                                titles_only=True) %}
+  {% endif %}
+  {% if global_toc %}
+    {{ global_toc }}
+  {% else %}
+    <!-- Local TOC -->
+    <div class="local-toc">{{ toc }}</div>
+  {% endif %}
+{% endblock %}
+<!-- End of overrides needed for the multilevel nav -->
+
+
 {%- block content %}
   {{ super() }}
   <script>

@@ -29,6 +47,7 @@
 </div>
 {% endblock %}

+
 {% block footer %}
   {{ super() }}
   <script>
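For reference, the theme_collapse_navigation and theme_includehidden values used in the menu override are the Sphinx template variables that mirror the theme options set in conf.py. A minimal, hypothetical sketch of where those values come from (the option values shown are assumptions, not part of this commit):

    # conf.py -- illustrative sketch only, not from this commit
    html_theme_options = {
        "collapse_navigation": False,  # exposed to templates as theme_collapse_navigation
        "includehidden": True,         # exposed to templates as theme_includehidden
    }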

beginner_source/basics/intro.py

Lines changed: 12 additions & 2 deletions

@@ -13,9 +13,9 @@
 ===================

 Authors:
-`Suraj Subramanian <https://github.com/suraj813>`_,
+`Suraj Subramanian <https://github.com/subramen>`_,
 `Seth Juarez <https://github.com/sethjuarez/>`_,
-`Cassie Breviu <https://github.com/cassieview/>`_,
+`Cassie Breviu <https://github.com/cassiebreviu/>`_,
 `Dmitry Soshnikov <https://soshnikov.com/>`_,
 `Ari Bornstein <https://github.com/aribornstein/>`_

@@ -49,6 +49,16 @@
 .. include:: /beginner_source/basics/qs_toc.txt

 .. toctree::
+   :maxdepth: 2
    :hidden:

+   quickstart_tutorial
+   tensorqs_tutorial
+   data_tutorial
+   transforms_tutorial
+   buildmodel_tutorial
+   autogradqs_tutorial
+   optimization_tutorial
+   saveloadrun_tutorial
+
 """

beginner_source/ddp_series_multigpu.rst

Lines changed: 66 additions & 48 deletions

@@ -9,7 +9,7 @@
 Multi GPU training with DDP
 ===========================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

@@ -19,13 +19,13 @@ Authors: `Suraj Subramanian <https://github.com/suraj813>`__
    - How to migrate a single-GPU training script to multi-GPU via DDP
    - Setting up the distributed process group
    - Saving and loading models in a distributed setup
-
+
 .. grid:: 1

    .. grid-item::

       :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py>`__
-
+
    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
       :class-card: card-prerequisites

@@ -45,11 +45,11 @@ In the `previous tutorial <ddp_series_theory.html>`__, we got a high-level overv
 In this tutorial, we start with a single-GPU training script and migrate that to running it on 4 GPUs on a single node.
 Along the way, we will talk through important concepts in distributed training while implementing them in our code.

-.. note::
+.. note::
    If your model contains any ``BatchNorm`` layers, it needs to be converted to ``SyncBatchNorm`` to sync the running stats of ``BatchNorm``
    layers across replicas.

-   Use the helper function
+   Use the helper function
    `torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) <https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#torch.nn.SyncBatchNorm.convert_sync_batchnorm>`__ to convert all ``BatchNorm`` layers in the model to ``SyncBatchNorm``.

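The note above only links to the helper function. As a minimal illustration that is not part of this diff, the conversion is applied once, before the model is wrapped in DDP (``model`` and ``gpu_id`` are assumed to exist as in the tutorial code):

    # Sketch only: swap BatchNorm layers for SyncBatchNorm, then wrap with DDP as usual
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model.to(gpu_id), device_ids=[gpu_id])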

@@ -58,27 +58,27 @@ Diff for `single_gpu.py <https://github.com/pytorch/examples/blob/main/distribut
 These are the changes you typically make to a single-GPU training script to enable DDP.

 Imports
-~~~~~~~
+-------
 - ``torch.multiprocessing`` is a PyTorch wrapper around Python's native
   multiprocessing
 - The distributed process group contains all the processes that can
   communicate and synchronize with each other.

-.. code-block:: diff
+.. code-block:: python

-    import torch
-    import torch.nn.functional as F
-    from utils import MyTrainDataset
+    import torch
+    import torch.nn.functional as F
+    from utils import MyTrainDataset

-   + import torch.multiprocessing as mp
-   + from torch.utils.data.distributed import DistributedSampler
-   + from torch.nn.parallel import DistributedDataParallel as DDP
-   + from torch.distributed import init_process_group, destroy_process_group
-   + import os
+    import torch.multiprocessing as mp
+    from torch.utils.data.distributed import DistributedSampler
+    from torch.nn.parallel import DistributedDataParallel as DDP
+    from torch.distributed import init_process_group, destroy_process_group
+    import os


 Constructing the process group
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------

 - First, before initializing the group process, call `set_device <https://pytorch.org/docs/stable/generated/torch.cuda.set_device.html?highlight=set_device#torch.cuda.set_device>`__,
   which sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0`
@@ -90,66 +90,66 @@ Constructing the process group
 - Read more about `choosing a DDP
   backend <https://pytorch.org/docs/stable/distributed.html#which-backend-to-use>`__

-.. code-block:: diff
+.. code-block:: python

-   + def ddp_setup(rank: int, world_size: int):
-   +     """
-   +     Args:
-   +         rank: Unique identifier of each process
-   +         world_size: Total number of processes
-   +     """
-   +     os.environ["MASTER_ADDR"] = "localhost"
-   +     os.environ["MASTER_PORT"] = "12355"
-   +     torch.cuda.set_device(rank)
-   +     init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    def ddp_setup(rank: int, world_size: int):
+        """
+        Args:
+            rank: Unique identifier of each process
+            world_size: Total number of processes
+        """
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        torch.cuda.set_device(rank)
+        init_process_group(backend="nccl", rank=rank, world_size=world_size)



 Constructing the DDP model
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------

-.. code-block:: diff
+.. code-block:: python

-   - self.model = model.to(gpu_id)
-   + self.model = DDP(model, device_ids=[gpu_id])
+    self.model = DDP(model, device_ids=[gpu_id])

 Distributing input data
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------

 - `DistributedSampler <https://pytorch.org/docs/stable/data.html?highlight=distributedsampler#torch.utils.data.distributed.DistributedSampler>`__
   chunks the input data across all distributed processes.
+- The `DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__ combines a dataset and a
+  sampler, and provides an iterable over the given dataset.
 - Each process will receive an input batch of 32 samples; the effective
   batch size is ``32 * nprocs``, or 128 when using 4 GPUs.

-.. code-block:: diff
+.. code-block:: python

     train_data = torch.utils.data.DataLoader(
         dataset=train_dataset,
         batch_size=32,
-       - shuffle=True,
-       + shuffle=False,
-       + sampler=DistributedSampler(train_dataset),
+        shuffle=False,  # We don't shuffle
+        sampler=DistributedSampler(train_dataset),  # Use the Distributed Sampler here.
     )

-- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work
+- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work
   properly across multiple epochs. Otherwise, the same ordering will be used in each epoch.

-.. code-block:: diff
+.. code-block:: python

     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-       + self.train_data.sampler.set_epoch(epoch)
+        self.train_data.sampler.set_epoch(epoch)  # call this additional line at every epoch
         for source, targets in self.train_data:
             ...
             self._run_batch(source, targets)


 Saving model checkpoints
-~~~~~~~~~~~~~~~~~~~~~~~~
-- We only need to save model checkpoints from one process. Without this
+------------------------
+- We only need to save model checkpoints from one process. Without this
   condition, each process would save its copy of the identical mode. Read
   more on saving and loading models with
-  DDP `here <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#save-and-load-checkpoints>`__
+  DDP `here <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#save-and-load-checkpoints>`__

 .. code-block:: diff

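The checkpoint-saving code block that this directive introduces falls between this hunk and the next one, so it is not visible in the diff. As a rough sketch of the pattern that section of the tutorial describes (names such as ``_save_checkpoint``, ``self.gpu_id``, and ``self.save_every`` are taken from the surrounding context; the exact body is an assumption):

    def _save_checkpoint(self, epoch):
        # DDP wraps the model, so the underlying weights live in .module
        ckp = self.model.module.state_dict()
        torch.save(ckp, "checkpoint.pt")

    def train(self, max_epochs: int):
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            # Only the rank:0 process writes the file; every rank holds an identical copy
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)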
@@ -164,18 +164,18 @@ Saving model checkpoints
 .. warning::
    `Collective calls <https://pytorch.org/docs/stable/distributed.html#collective-functions>`__ are functions that run on all the distributed processes,
    and they are used to gather certain states or values to a specific process. Collective calls require all ranks to run the collective code.
-   In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process.
+   In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process.
    If you need to make any collective calls, it should be before the ``if self.gpu_id == 0`` check.


 Running the distributed training job
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------

 - Include new arguments ``rank`` (replacing ``device``) and
   ``world_size``.
 - ``rank`` is auto-allocated by DDP when calling
   `mp.spawn <https://pytorch.org/docs/stable/multiprocessing.html#spawning-subprocesses>`__.
-- ``world_size`` is the number of processes across the training job. For GPU training,
+- ``world_size`` is the number of processes across the training job. For GPU training,
   this corresponds to the number of GPUs in use, and each process works on a dedicated GPU.

 .. code-block:: diff
@@ -189,7 +189,7 @@ Running the distributed training job
     + trainer = Trainer(model, train_data, optimizer, rank, save_every)
       trainer.train(total_epochs)
     + destroy_process_group()
-
+
     if __name__ == "__main__":
         import sys
         total_epochs = int(sys.argv[1])

@@ -199,13 +199,31 @@ Running the distributed training job
     + world_size = torch.cuda.device_count()
     + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)

+Here's what the code looks like:
+
+.. code-block:: python
+    def main(rank, world_size, total_epochs, save_every):
+        ddp_setup(rank, world_size)
+        dataset, model, optimizer = load_train_objs()
+        train_data = prepare_dataloader(dataset, batch_size=32)
+        trainer = Trainer(model, train_data, optimizer, rank, save_every)
+        trainer.train(total_epochs)
+        destroy_process_group()
+
+    if __name__ == "__main__":
+        import sys
+        total_epochs = int(sys.argv[1])
+        save_every = int(sys.argv[2])
+        world_size = torch.cuda.device_count()
+        mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)
+


 Further Reading
 ---------------

 - `Fault Tolerant distributed training <ddp_series_fault_tolerance.html>`__ (next tutorial in this series)
 - `Intro to DDP <ddp_series_theory.html>`__ (previous tutorial in this series)
-- `Getting Started with DDP <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`__
+- `Getting Started with DDP <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`__
 - `Process Group
-  initialization <https://pytorch.org/docs/stable/distributed.html#tcp-initialization>`__
+  Initialization <https://pytorch.org/docs/stable/distributed.html#tcp-initialization>`__
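Since the ``__main__`` block above reads ``total_epochs`` and ``save_every`` from the command line and ``mp.spawn`` starts one process per visible GPU, a run of the finished script would look something like ``python multigpu.py 10 5`` (illustrative values only: train for 10 epochs, saving a checkpoint every 5).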
