pytorch · svekars · Jan 24, 2024 · Dec 8, 2023 · Dec 8, 2023 · Dec 8, 2023
diff --git a/index.rst b/index.rst
@@ -623,6 +623,13 @@ What's new in PyTorch tutorials?
    :link: intermediate/torch_compile_tutorial.html
    :tags: Model-Optimization
 
+.. customcarditem::
+   :header: (beta) Compiling the Optimizer with torch.compile
+   :card_description: Speed up the optimizer using torch.compile
+   :image: _static/img/thumbnails/cropped/generic-pytorch-logo.png
+   :link: recipes/compiling_optimizer.html
+   :tags: Model-Optimization
+
 .. customcarditem::
    :header: Inductor CPU Backend Debugging and Profiling
    :card_description: Learn the usage, debugging and performance profiling for ``torch.compile`` with Inductor CPU backend.
@@ -1046,6 +1053,7 @@ Additional Resources
    intermediate/nvfuser_intro_tutorial
    intermediate/ax_multiobjective_nas_tutorial
    intermediate/torch_compile_tutorial
+   recipes/compiling_optimizer
    intermediate/inductor_debug_cpu
    intermediate/scaled_dot_product_attention_tutorial
    beginner/knowledge_distillation_tutorial

diff --git a/recipes_source/compiling_optimizer.py b/recipes_source/compiling_optimizer.py
@@ -0,0 +1,71 @@
+"""
+(beta) Compiling the optimizer with torch.compile
+==========================================================================================
+
+
+**Author:** `Michael Lazos <https://github.com/mlazos>`_
+"""
+
+######################################################################
+# Summary
+# ~~~~~~~~
+#
+# In this tutorial we will apply ``torch.compile`` to the optimizer to observe
+# the GPU performance improvement.
+#
+# .. note::
+#
+#   This tutorial requires PyTorch 2.2.0 or later.
+#
+
+
+######################################################################
+# Model Setup
+# ~~~~~~~~~~~~~~~~~~~~~
+# For this example we'll use a simple sequence of linear layers.
+# Since we are only benchmarking the optimizer, the choice of model doesn't matter
+# because optimizer performance is a function of the number of parameters.
+#
+# Depending on what machine you are using, your exact results may vary.
+
+import torch
+
+model = torch.nn.Sequential(
+    *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]  # ten 1024 -> 1024 linear layers, created on the CUDA device
+)
+input = torch.rand(1024, device="cuda")  # NOTE(review): ``input`` shadows the Python builtin of the same name
+output = model(input)
+output.sum().backward()  # reduce the output to a scalar and backpropagate before the optimizer is stepped
+
+#############################################################################
+# Setting up and running the optimizer benchmark
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# In this example, we'll use the Adam optimizer
+# and create a helper function to wrap the ``step()`` call
+# in ``torch.compile()``.
+
+opt = torch.optim.Adam(model.parameters(), lr=0.01)  # Adam over every parameter of ``model``
+
+
+@torch.compile()  # torch.compile is applied to this wrapper, not to ``opt.step`` directly
+def fn():
+    opt.step()  # one step of the module-level optimizer ``opt``
+
+
+# Let's define a helpful benchmarking function:
+import torch.utils.benchmark as benchmark
+
+
+def benchmark_torch_function_in_microseconds(f, *args, **kwargs):  # times the call ``f(*args, **kwargs)``
+    t0 = benchmark.Timer(  # ``stmt`` is the timed statement; the names it uses come from ``globals``
+        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
+    )
+    return t0.blocked_autorange().mean * 1e6  # mean scaled by 1e6; assumes ``.mean`` is in seconds (so the result is microseconds) -- confirm against the Timer docs
+
+
+# Warmup runs to compile the function
+for _ in range(5):
+    fn()  # untimed calls to the compiled wrapper, made before any benchmarking
+
+print(f"eager runtime: {benchmark_torch_function_in_microseconds(opt.step)}us")  # baseline: times ``opt.step`` called directly
+print(f"compiled runtime: {benchmark_torch_function_in_microseconds(fn)}us")  # times the ``torch.compile``-wrapped ``fn``