
Commit cbe274c

Merge branch 'main' into executorch-redirects

2 parents: 8b1d466 + 33753d3

36 files changed, +334 -2511 lines

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@

 NOT_RUN = [
     "beginner_source/basics/intro", # no code
+    "beginner_source/introyt/introyt_index", # no code
     "beginner_source/onnx/intro_onnx",
     "beginner_source/profiler",
     "beginner_source/saving_loading_models",

.lycheeignore

Lines changed: 3 additions & 0 deletions

@@ -6,3 +6,6 @@ file:///f:/libtmp/some_file

 #Ignore links with "file:///" to catch any other example links
 file:\/\/\/.*
+
+# Ignore colab link in the setting of conf.py
+https://pytorch.org/tutorials/beginner/colab/n

_static/css/custom2.css

Lines changed: 18 additions & 0 deletions

@@ -17,3 +17,21 @@
     margin-bottom: 5px;
   }
 }
+
+/* Left nav for 2nd level nav */
+
+.pytorch-left-menu li.toctree-l2 {
+  padding-left: 10px;
+}
+
+.pytorch-left-menu li.toctree-l2.current > a, {
+  color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l2.current a:link.reference.internal {
+  color: #ee4c2c;
+}
+
+.pytorch-left-menu li.toctree-l1.current > a:before {
+  content: "";
+}

_static/js/custom.js

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+document.addEventListener("DOMContentLoaded", function() {
+  // Select all <li> elements with the class "toctree-l1"
+  var toctreeItems = document.querySelectorAll('li.toctree-l1');
+
+  toctreeItems.forEach(function(item) {
+    // Find the link within the item
+    var link = item.querySelector('a');
+    var nestedList = item.querySelector('ul');
+
+    if (link && nestedList) {
+      // Create a span element for the "[+]" or "[-]" sign
+      var expandSign = document.createElement('span');
+      expandSign.style.cursor = 'pointer'; // Make it look clickable
+
+      // Use the link text as a unique key for localStorage
+      var sectionKey = 'section_' + link.textContent.trim().replace(/\s+/g, '_');
+
+      // Retrieve the saved state from localStorage
+      var isExpanded = localStorage.getItem(sectionKey);
+
+      // If no state is saved, default to expanded for "Learn the Basics" and collapsed for others
+      if (isExpanded === null) {
+        isExpanded = (link.textContent.trim() === 'Learn the Basics') ? 'true' : 'false';
+        localStorage.setItem(sectionKey, isExpanded);
+      }
+
+      if (isExpanded === 'true') {
+        nestedList.style.display = 'block'; // Expand the section
+        expandSign.textContent = '[-] '; // Show "[-]" since it's expanded
+      } else {
+        nestedList.style.display = 'none'; // Collapse the section
+        expandSign.textContent = '[+] '; // Show "[+]" since it's collapsed
+      }
+
+      // Add a click event to toggle the nested list
+      expandSign.addEventListener('click', function() {
+        if (nestedList.style.display === 'none') {
+          nestedList.style.display = 'block';
+          expandSign.textContent = '[-] '; // Change to "[-]" when expanded
+          localStorage.setItem(sectionKey, 'true'); // Save state
+        } else {
+          nestedList.style.display = 'none';
+          expandSign.textContent = '[+] '; // Change back to "[+]" when collapsed
+          localStorage.setItem(sectionKey, 'false'); // Save state
+        }
+      });
+
+      // Insert the sign before the link
+      link.parentNode.insertBefore(expandSign, link);
+    }
+  });
+});

_templates/layout.html

Lines changed: 19 additions & 0 deletions

@@ -1,5 +1,23 @@
 {% extends "!layout.html" %}

+
+<!-- Overrides needed for the multilevel nav -->
+{% block menu %}
+  {% if 'singlehtml' not in builder %}
+    {% set global_toc = toctree(collapse=theme_collapse_navigation|tobool,
+                                includehidden=theme_includehidden|tobool,
+                                titles_only=True) %}
+  {% endif %}
+  {% if global_toc %}
+    {{ global_toc }}
+  {% else %}
+    <!-- Local TOC -->
+    <div class="local-toc">{{ toc }}</div>
+  {% endif %}
+{% endblock %}
+<!-- End of overrides needed for the multilevel nav -->
+
+
 {%- block content %}
   {{ super() }}
   <script>

@@ -29,6 +47,7 @@
 </div>
 {% endblock %}

+
 {% block footer %}
   {{ super() }}
   <script>
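For reference, the theme_collapse_navigation and theme_includehidden values used in the menu override are the Sphinx template variables that mirror the theme options set in conf.py. A minimal, hypothetical sketch of where those values come from (the option values shown are assumptions, not part of this commit):

    # conf.py -- illustrative sketch only, not from this commit
    html_theme_options = {
        "collapse_navigation": False,  # exposed to templates as theme_collapse_navigation
        "includehidden": True,         # exposed to templates as theme_includehidden
    }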

beginner_source/basics/intro.py

Lines changed: 12 additions & 2 deletions

@@ -13,9 +13,9 @@
 ===================

 Authors:
-`Suraj Subramanian <https://github.com/suraj813>`_,
+`Suraj Subramanian <https://github.com/subramen>`_,
 `Seth Juarez <https://github.com/sethjuarez/>`_,
-`Cassie Breviu <https://github.com/cassieview/>`_,
+`Cassie Breviu <https://github.com/cassiebreviu/>`_,
 `Dmitry Soshnikov <https://soshnikov.com/>`_,
 `Ari Bornstein <https://github.com/aribornstein/>`_

@@ -49,6 +49,16 @@
 .. include:: /beginner_source/basics/qs_toc.txt

 .. toctree::
+   :maxdepth: 2
    :hidden:

+   quickstart_tutorial
+   tensorqs_tutorial
+   data_tutorial
+   transforms_tutorial
+   buildmodel_tutorial
+   autogradqs_tutorial
+   optimization_tutorial
+   saveloadrun_tutorial
+
 """

beginner_source/ddp_series_multigpu.rst

Lines changed: 66 additions & 48 deletions

@@ -9,7 +9,7 @@
 Multi GPU training with DDP
 ===========================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

@@ -19,13 +19,13 @@ Authors: `Suraj Subramanian <https://github.com/suraj813>`__
    - How to migrate a single-GPU training script to multi-GPU via DDP
    - Setting up the distributed process group
    - Saving and loading models in a distributed setup
-
+
 .. grid:: 1

    .. grid-item::

       :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py>`__
-
+
    .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
       :class-card: card-prerequisites

@@ -45,11 +45,11 @@ In the `previous tutorial <ddp_series_theory.html>`__, we got a high-level overv
 In this tutorial, we start with a single-GPU training script and migrate that to running it on 4 GPUs on a single node.
 Along the way, we will talk through important concepts in distributed training while implementing them in our code.

-.. note::
+.. note::
    If your model contains any ``BatchNorm`` layers, it needs to be converted to ``SyncBatchNorm`` to sync the running stats of ``BatchNorm``
    layers across replicas.

-   Use the helper function
+   Use the helper function
    `torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) <https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#torch.nn.SyncBatchNorm.convert_sync_batchnorm>`__ to convert all ``BatchNorm`` layers in the model to ``SyncBatchNorm``.

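The note above only links to the helper function. As a minimal illustration that is not part of this diff, the conversion is applied once, before the model is wrapped in DDP (``model`` and ``gpu_id`` are assumed to exist as in the tutorial code):

    # Sketch only: swap BatchNorm layers for SyncBatchNorm, then wrap with DDP as usual
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = DDP(model.to(gpu_id), device_ids=[gpu_id])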

@@ -58,27 +58,27 @@ Diff for `single_gpu.py <https://github.com/pytorch/examples/blob/main/distribut
 These are the changes you typically make to a single-GPU training script to enable DDP.

 Imports
-~~~~~~~
+-------
 - ``torch.multiprocessing`` is a PyTorch wrapper around Python's native
   multiprocessing
 - The distributed process group contains all the processes that can
   communicate and synchronize with each other.

-.. code-block:: diff
+.. code-block:: python

-    import torch
-    import torch.nn.functional as F
-    from utils import MyTrainDataset
+    import torch
+    import torch.nn.functional as F
+    from utils import MyTrainDataset

-   + import torch.multiprocessing as mp
-   + from torch.utils.data.distributed import DistributedSampler
-   + from torch.nn.parallel import DistributedDataParallel as DDP
-   + from torch.distributed import init_process_group, destroy_process_group
-   + import os
+    import torch.multiprocessing as mp
+    from torch.utils.data.distributed import DistributedSampler
+    from torch.nn.parallel import DistributedDataParallel as DDP
+    from torch.distributed import init_process_group, destroy_process_group
+    import os


 Constructing the process group
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------

 - First, before initializing the group process, call `set_device <https://pytorch.org/docs/stable/generated/torch.cuda.set_device.html?highlight=set_device#torch.cuda.set_device>`__,
   which sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0`
@@ -90,66 +90,66 @@ Constructing the process group
 - Read more about `choosing a DDP
   backend <https://pytorch.org/docs/stable/distributed.html#which-backend-to-use>`__

-.. code-block:: diff
+.. code-block:: python

-   + def ddp_setup(rank: int, world_size: int):
-   +     """
-   +     Args:
-   +         rank: Unique identifier of each process
-   +         world_size: Total number of processes
-   +     """
-   +     os.environ["MASTER_ADDR"] = "localhost"
-   +     os.environ["MASTER_PORT"] = "12355"
-   +     torch.cuda.set_device(rank)
-   +     init_process_group(backend="nccl", rank=rank, world_size=world_size)
+    def ddp_setup(rank: int, world_size: int):
+        """
+        Args:
+            rank: Unique identifier of each process
+            world_size: Total number of processes
+        """
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = "12355"
+        torch.cuda.set_device(rank)
+        init_process_group(backend="nccl", rank=rank, world_size=world_size)



 Constructing the DDP model
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+--------------------------

-.. code-block:: diff
+.. code-block:: python

-   - self.model = model.to(gpu_id)
-   + self.model = DDP(model, device_ids=[gpu_id])
+    self.model = DDP(model, device_ids=[gpu_id])

 Distributing input data
-~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------

 - `DistributedSampler <https://pytorch.org/docs/stable/data.html?highlight=distributedsampler#torch.utils.data.distributed.DistributedSampler>`__
   chunks the input data across all distributed processes.
+- The `DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`__ combines a dataset and a
+  sampler, and provides an iterable over the given dataset.
 - Each process will receive an input batch of 32 samples; the effective
   batch size is ``32 * nprocs``, or 128 when using 4 GPUs.

-.. code-block:: diff
+.. code-block:: python

     train_data = torch.utils.data.DataLoader(
         dataset=train_dataset,
         batch_size=32,
-       - shuffle=True,
-       + shuffle=False,
-       + sampler=DistributedSampler(train_dataset),
+        shuffle=False,  # We don't shuffle
+        sampler=DistributedSampler(train_dataset),  # Use the Distributed Sampler here.
     )

-- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work
+- Calling the ``set_epoch()`` method on the ``DistributedSampler`` at the beginning of each epoch is necessary to make shuffling work
   properly across multiple epochs. Otherwise, the same ordering will be used in each epoch.

-.. code-block:: diff
+.. code-block:: python

     def _run_epoch(self, epoch):
         b_sz = len(next(iter(self.train_data))[0])
-       + self.train_data.sampler.set_epoch(epoch)
+        self.train_data.sampler.set_epoch(epoch)  # call this additional line at every epoch
         for source, targets in self.train_data:
             ...
             self._run_batch(source, targets)


 Saving model checkpoints
-~~~~~~~~~~~~~~~~~~~~~~~~
-- We only need to save model checkpoints from one process. Without this
+------------------------
+- We only need to save model checkpoints from one process. Without this
   condition, each process would save its copy of the identical mode. Read
   more on saving and loading models with
-  DDP `here <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#save-and-load-checkpoints>`__
+  DDP `here <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#save-and-load-checkpoints>`__

 .. code-block:: diff

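The checkpoint-saving code block that this directive introduces falls between this hunk and the next one, so it is not visible in the diff. As a rough sketch of the pattern that section of the tutorial describes (names such as ``_save_checkpoint``, ``self.gpu_id``, and ``self.save_every`` are taken from the surrounding context; the exact body is an assumption):

    def _save_checkpoint(self, epoch):
        # DDP wraps the model, so the underlying weights live in .module
        ckp = self.model.module.state_dict()
        torch.save(ckp, "checkpoint.pt")

    def train(self, max_epochs: int):
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            # Only the rank:0 process writes the file; every rank holds an identical copy
            if self.gpu_id == 0 and epoch % self.save_every == 0:
                self._save_checkpoint(epoch)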
@@ -164,18 +164,18 @@ Saving model checkpoints
 .. warning::
    `Collective calls <https://pytorch.org/docs/stable/distributed.html#collective-functions>`__ are functions that run on all the distributed processes,
    and they are used to gather certain states or values to a specific process. Collective calls require all ranks to run the collective code.
-   In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process.
+   In this example, `_save_checkpoint` should not have any collective calls because it is only run on the ``rank:0`` process.
    If you need to make any collective calls, it should be before the ``if self.gpu_id == 0`` check.


 Running the distributed training job
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+------------------------------------

 - Include new arguments ``rank`` (replacing ``device``) and
   ``world_size``.
 - ``rank`` is auto-allocated by DDP when calling
   `mp.spawn <https://pytorch.org/docs/stable/multiprocessing.html#spawning-subprocesses>`__.
-- ``world_size`` is the number of processes across the training job. For GPU training,
+- ``world_size`` is the number of processes across the training job. For GPU training,
   this corresponds to the number of GPUs in use, and each process works on a dedicated GPU.

 .. code-block:: diff
@@ -189,7 +189,7 @@ Running the distributed training job
     + trainer = Trainer(model, train_data, optimizer, rank, save_every)
       trainer.train(total_epochs)
     + destroy_process_group()
-
+
     if __name__ == "__main__":
         import sys
         total_epochs = int(sys.argv[1])

@@ -199,13 +199,31 @@ Running the distributed training job
     + world_size = torch.cuda.device_count()
     + mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)

+Here's what the code looks like:
+
+.. code-block:: python
+    def main(rank, world_size, total_epochs, save_every):
+        ddp_setup(rank, world_size)
+        dataset, model, optimizer = load_train_objs()
+        train_data = prepare_dataloader(dataset, batch_size=32)
+        trainer = Trainer(model, train_data, optimizer, rank, save_every)
+        trainer.train(total_epochs)
+        destroy_process_group()
+
+    if __name__ == "__main__":
+        import sys
+        total_epochs = int(sys.argv[1])
+        save_every = int(sys.argv[2])
+        world_size = torch.cuda.device_count()
+        mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)
+


 Further Reading
 ---------------

 - `Fault Tolerant distributed training <ddp_series_fault_tolerance.html>`__ (next tutorial in this series)
 - `Intro to DDP <ddp_series_theory.html>`__ (previous tutorial in this series)
-- `Getting Started with DDP <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`__
+- `Getting Started with DDP <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`__
 - `Process Group
-  initialization <https://pytorch.org/docs/stable/distributed.html#tcp-initialization>`__
+  Initialization <https://pytorch.org/docs/stable/distributed.html#tcp-initialization>`__
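Since the ``__main__`` block above reads ``total_epochs`` and ``save_every`` from the command line and ``mp.spawn`` starts one process per visible GPU, a run of the finished script would look something like ``python multigpu.py 10 5`` (illustrative values only: train for 10 epochs, saving a checkpoint every 5).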
