Commit 3769402

Merge branch 'main' into fix-pgs
2 parents 67d797c + 1fcb66e commit 3769402

16 files changed, +836 -100 lines changed

.ci/docker/Dockerfile

Lines changed: 5 additions & 9 deletions
@@ -15,15 +15,11 @@ RUN bash ./install_user.sh && rm install_user.sh
 COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
 RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

-# Install conda and other packages
-ENV ANACONDA_PYTHON_VERSION=3.10
-ENV CONDA_CMAKE yes
-ENV DOCS yes
-ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-COPY ./requirements.txt /opt/conda/
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/common_utils.sh common_utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements.txt
+COPY ./common/install_pip_requirements.sh install_pip_requirements.sh
+COPY ./requirements.txt requirements.txt
+RUN bash ./install_pip_requirements.sh && rm install_pip_requirements.sh
+
+RUN ln -s /usr/bin/python3 /usr/bin/python

 USER ci-user
 CMD ["bash"]
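Note: a quick local smoke test of the slimmer image could look like the following; the tutorials-ci:latest tag is only a hypothetical placeholder for whatever tag .ci/docker/build.sh assigns, not a name introduced by this commit.

    # Hypothetical image tag; substitute the tag produced by your local build.
    docker run --rm tutorials-ci:latest python --version   # resolves through the new /usr/bin/python symlink
    docker run --rm tutorials-ci:latest pip list            # lists the packages installed from requirements.txt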

.ci/docker/build.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ set -exu
 IMAGE_NAME="$1"
 shift

-export UBUNTU_VERSION="20.04"
+export UBUNTU_VERSION="22.04"
 export CUDA_VERSION="12.4.1"

 export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
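Note: with the bumped Ubuntu version, the exported variables compose into the base image tag as sketched below; this only spells out the substitution already present in build.sh, it is not an additional change.

    # Same variables as in the diff above, shown only to make the resolution explicit.
    export UBUNTU_VERSION="22.04"
    export CUDA_VERSION="12.4.1"
    export BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
    echo "${BASE_IMAGE}"   # prints: nvidia/cuda:12.4.1-devel-ubuntu22.04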

.ci/docker/common/common_utils.sh

Lines changed: 0 additions & 26 deletions
This file was deleted.

.ci/docker/common/install_base.sh

Lines changed: 4 additions & 2 deletions
@@ -10,7 +10,7 @@ install_ubuntu() {
   apt-get install -y --no-install-recommends \
     build-essential \
     ca-certificates \
-    cmake=3.16* \
+    cmake=3.22* \
     curl \
     git \
     wget \
@@ -27,7 +27,9 @@ install_ubuntu() {
     libglfw3-dev \
     sox \
     libsox-dev \
-    libsox-fmt-all
+    libsox-fmt-all \
+    python3-pip \
+    python3-dev

   # Cleanup package manager
   apt-get autoclean && apt-get clean

.ci/docker/common/install_conda.sh

Lines changed: 0 additions & 55 deletions
This file was deleted.
.ci/docker/common/install_pip_requirements.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -ex
+
+# Install pip packages
+pip install --upgrade pip
+pip install -r ./requirements.txt
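Note: the script resolves ./requirements.txt against its working directory, which matches the Dockerfile above (both files are copied side by side before the RUN step). A rough local equivalent, assuming it is invoked from .ci/docker/ where requirements.txt lives:

    cd .ci/docker
    bash ./common/install_pip_requirements.sh   # upgrades pip, then installs ./requirements.txt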

.jenkins/metadata.json

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,7 @@
     },
     "recipes_source/torch_export_aoti_python.py": {
         "needs": "linux.g5.4xlarge.nvidia.gpu"
-    },
+    },
     "advanced_source/pendulum.py": {
         "needs": "linux.g5.4xlarge.nvidia.gpu",
         "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run."
@@ -58,6 +58,9 @@
     "intermediate_source/scaled_dot_product_attention_tutorial.py": {
         "needs": "linux.g5.4xlarge.nvidia.gpu"
     },
+    "intermediate_source/transformer_building_blocks.py": {
+        "needs": "linux.g5.4xlarge.nvidia.gpu"
+    },
     "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": {
         "needs": "linux.g5.4xlarge.nvidia.gpu"
     },

.jenkins/validate_tutorials_built.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@
     "intermediate_source/mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py
     "intermediate_source/fx_conv_bn_fuser",
     "intermediate_source/_torch_export_nightly_tutorial", # does not work on release
+    "intermediate_source/transformer_building_blocks", # does not work on release
     "advanced_source/super_resolution_with_onnxruntime",
     "advanced_source/usb_semisup_learn", # fails with CUDA OOM error, should try on a different worker
     "prototype_source/fx_graph_mode_ptq_dynamic",

beginner_source/ddp_series_fault_tolerance.rst

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 Fault-tolerant Distributed Training with ``torchrun``
 =====================================================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

beginner_source/ddp_series_theory.rst

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 What is Distributed Data Parallel (DDP)
 =======================================

-Authors: `Suraj Subramanian <https://github.com/suraj813>`__
+Authors: `Suraj Subramanian <https://github.com/subramen>`__

 .. grid:: 2

en-wordlist.txt

Lines changed: 19 additions & 0 deletions
@@ -1,5 +1,6 @@
 ACL
 ADI
+ALiBi
 AOT
 AOTInductor
 APIs
@@ -79,6 +80,7 @@ FX
 FX's
 FairSeq
 Fastpath
+FFN
 FloydHub
 FloydHub's
 Frobenius
@@ -127,6 +129,7 @@ Kihyuk
 Kiuk
 Kubernetes
 Kuei
+KV
 LRSchedulers
 LSTM
 LSTMs
@@ -162,6 +165,7 @@ NLP
 NTK
 NUMA
 NaN
+NaNs
 NanoGPT
 Netron
 NeurIPS
@@ -231,6 +235,7 @@ Sigmoid
 SoTA
 Sohn
 Spacy
+SwiGLU
 TCP
 THP
 TIAToolbox
@@ -276,6 +281,7 @@ Xcode
 Xeon
 Yidong
 YouTube
+Zipf
 accelerometer
 accuracies
 activations
@@ -305,6 +311,7 @@ bbAP
 benchmarked
 benchmarking
 bitwise
+bool
 boolean
 breakpoint
 broadcasted
@@ -333,6 +340,7 @@ csv
 cuDNN
 cuda
 customizable
+customizations
 datafile
 dataflow
 dataframe
@@ -377,6 +385,7 @@ fbgemm
 feedforward
 finetune
 finetuning
+FlexAttention
 fp
 frontend
 functionalized
@@ -431,6 +440,7 @@ mAP
 macos
 manualSeed
 matmul
+matmuls
 matplotlib
 memcpy
 memset
@@ -446,6 +456,7 @@ modularized
 mpp
 mucosa
 multihead
+MultiheadAttention
 multimodal
 multimodality
 multinode
@@ -456,7 +467,11 @@ multithreading
 namespace
 natively
 ndarrays
+nheads
 nightlies
+NJT
+NJTs
+NJT's
 num
 numericalize
 numpy
@@ -532,6 +547,7 @@ runtime
 runtime
 runtimes
 scalable
+SDPA
 sharded
 softmax
 sparsified
@@ -591,12 +607,14 @@ tradeoff
 tradeoffs
 triton
 uint
+UX
 umap
 uncomment
 uncommented
 underflowing
 unfused
 unimodal
+unigram
 unnormalized
 unoptimized
 unparametrized
@@ -618,6 +636,7 @@ warmstarted
 warmstarting
 warmup
 webp
+wikitext
 wsi
 wsis
 Meta's

index.rst

Lines changed: 8 additions & 0 deletions
@@ -664,6 +664,14 @@ Welcome to PyTorch Tutorials
    :link: beginner/knowledge_distillation_tutorial.html
    :tags: Model-Optimization,Image/Video

+
+.. customcarditem::
+   :header: Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile()
+   :card_description: This tutorial goes over recommended best practices for implementing Transformers with native PyTorch.
+   :image: _static/img/thumbnails/cropped/pytorch-logo.png
+   :link: intermediate/transformer_building_blocks.html
+   :tags: Transformer
+
 .. Parallel-and-Distributed-Training

intermediate_source/process_group_cpp_extension_tutorial.rst

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,8 @@ Basics

 PyTorch collective communications power several widely adopted distributed
 training features, including
-`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__,
-`ZeroRedundancyOptimizer <https://pytorch.org/docs/stable/distributed.optim.html#torch.distributed.optim.ZeroRedundancyOptimizer>`__,
-`FullyShardedDataParallel <https://github.com/pytorch/pytorch/blob/master/torch/distributed/_fsdp/fully_sharded_data_parallel.py>`__.
+`DistributedDataParallel <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html>`__ and
+`ZeroRedundancyOptimizer <https://pytorch.org/docs/stable/distributed.optim.html#torch.distributed.optim.ZeroRedundancyOptimizer>`__.
 In order to make the same collective communication API work with
 different communication backends, the distributed package abstracts collective
 communication operations into a

intermediate_source/rpc_async_execution.rst

Lines changed: 1 addition & 1 deletion
@@ -199,7 +199,7 @@ speed.
 Batch-Processing CartPole Solver
 --------------------------------

-This section uses CartPole-v1 from `OpenAI Gym <https://gym.openai.com/>`__ as
+This section uses CartPole-v1 from OpenAI Gym as
 an example to show the performance impact of batch processing RPC. Please note
 that since the goal is to demonstrate the usage of
 `@rpc.functions.async_execution <https://pytorch.org/docs/master/rpc.html#torch.distributed.rpc.functions.async_execution>`__
