
Commit 47facc8

Mark fp8 buffers as static (#225)

Authored by drisspg, committed by facebook-github-bot

Summary: Thank you eellison, based off of this repro: #119 (comment). Marking the individual buffers allows for CUDA graphs to be used.

Pull Request resolved: #225
Reviewed By: awgu
Differential Revision: D54178086
Pulled By: drisspg
fbshipit-source-id: 8797045b2a88825601b0fe7c8cadc03f557af96e

1 parent: b508920
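
For context, the change follows the pattern sketched below. This is a minimal, self-contained illustration, not code from this repository: the `ScaledLinear` module and `sync` function are hypothetical stand-ins for `Float8Linear` and the amax/scale sync step, and a CUDA device is assumed. The idea is that inductor's "reduce-overhead" mode skips CUDA graph capture when a compiled function mutates its inputs; marking module buffers as static addresses lets those in-place updates be treated as writes to stable storage.

import torch
import torch.nn as nn


class ScaledLinear(nn.Module):
    """Hypothetical stand-in for Float8Linear: holds a buffer that a
    compiled function updates in place (like the fp8 amax/scale buffers)."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 16)
        self.register_buffer("amax", torch.zeros(1))

    def forward(self, x):
        return self.linear(x)


model = ScaledLinear().to("cuda")

# The pattern from this commit: while running eagerly (not during tracing),
# pin each buffer's address so the cudagraphs backend can treat in-place
# buffer updates as writes to static inputs instead of skipping capture.
if not torch._dynamo.is_compiling():
    for buf in model.buffers():
        torch._dynamo.mark_static_address(buf, guard=True)


@torch.compile(mode="reduce-overhead", fullgraph=True)
def sync(m, x):
    # In-place mutation of a module buffer, mirroring the amax/scale sync.
    m.amax.copy_(x.abs().amax())


sync(model, torch.randn(16, 16, device="cuda"))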

File tree

2 files changed: +47 −4 lines

float8_experimental/float8_linear_utils.py
test/test_compile.py

float8_experimental/float8_linear_utils.py

Lines changed: 6 additions & 4 deletions

@@ -148,7 +148,10 @@ def get_float8_layers(model: torch.nn.Module):
 
     # Get all fp8 layers and tensors
     fp8_layers = [child for child in model.modules() if isinstance(child, Float8Linear)]
-
+    if not torch._dynamo.is_compiling():
+        for layer in fp8_layers:
+            for buf in layer.buffers():
+                torch._dynamo.mark_static_address(buf, guard=True)
     return fp8_layers
 
 
@@ -290,7 +293,7 @@ def inner_func():
             fp8_dL_dY_amax_history_stack, torch.float8_e5m2, x_dtype, scale_fn_recipe
         )
 
-        # Iterate through the layers and update the scales, and set the flag to signal that the amaxes/scales are ready
+        # Iterate through the layers and update the scales
        for idx, child in enumerate(fp8_layers):
            child.fp8_scale_x.copy_(new_x_scales[idx])
            child.fp8_scale_w.copy_(new_w_scales[idx])
@@ -301,6 +304,5 @@ def inner_func():
     inner_func()
 
     for child in fp8_layers:
-        # 4. set a flag to signal amaxes/scales are ready
-        # We only update the flag if we know it will be checked by the modules
+        # Set a flag to signal amaxes/scales are ready
         child.amax_and_scale_synced = True
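
Because of the `torch._dynamo.is_compiling()` guard, the buffers are pinned only when `get_float8_layers` runs eagerly. The intended call pattern, which the new test below exercises, is roughly the following sketch (assumes a CUDA device and the helpers shown in these diffs):

import torch
import torch.nn as nn
from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import (
    get_float8_layers,
    swap_linear_with_float8_linear,
    sync_float8_amax_and_scale_history,
)

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)).to("cuda")
swap_linear_with_float8_linear(model, Float8Linear)

# Eager call: each fp8 buffer is marked as a static address here.
fp8_layers = get_float8_layers(model)

# Compile only the sync step; its buffer copies now mutate static inputs.
sync_func = torch.compile(
    sync_float8_amax_and_scale_history, mode="reduce-overhead", fullgraph=True
)

inpt = torch.randn(16, 16, device="cuda")
model(inpt)                   # forward pass records the amaxes
sync_func(model, fp8_layers)  # cudagraph-friendly in-place scale updates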

test/test_compile.py

Lines changed: 41 additions & 0 deletions

@@ -5,14 +5,17 @@
 # LICENSE file in the root directory of this source tree.
 import copy
 import random
+import sys
 import unittest
+from io import StringIO
 
 import pytest
 
 import torch
 import torch.nn as nn
 from float8_experimental.float8_linear import Float8Linear
 from float8_experimental.float8_linear_utils import (
+    get_float8_layers,
     get_float8_linear,
     LinearType,
     swap_linear_with_float8_linear,
@@ -218,5 +221,43 @@ def test_sync_amax_func():
     assert cnts.frame_count == 1, "Compiled graph should have 1 frame!"
 
 
+class capture_stderr(list):
+    """
+    Replace sys.stderr with a temporary StringIO
+    """
+
+    def __enter__(self):
+        self.sys_stderr = sys.stderr
+        self.stringio = StringIO()
+        sys.stderr = self.stringio
+        return self
+
+    def __exit__(self, *args):
+        self.append(str(self.stringio.getvalue()))
+        del self.stringio
+        sys.stderr = self.sys_stderr
+
+
+@unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA not available")
+def test_sync_amax_func_cuda_graph_success():
+    torch._dynamo.reset()
+    with capture_stderr() as stderr:
+        my_module = nn.Sequential(
+            nn.Linear(16, 32, bias=True), nn.ReLU(), nn.Linear(32, 16, bias=True)
+        ).to("cuda")
+        swap_linear_with_float8_linear(my_module, Float8Linear)
+        inpt = torch.randn(
+            16, 16, device="cuda", dtype=torch.float32, requires_grad=True
+        )
+        sync_func = torch.compile(
+            sync_float8_amax_and_scale_history, mode="reduce-overhead", fullgraph=True
+        )
+        fp8_layers = get_float8_layers(my_module)
+        my_module(inpt)
+        sync_func(my_module, fp8_layers)
+
+    assert "skipping cudagraphs due to mutaton on input" not in stderr[0]
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
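
As a standalone illustration (not part of the commit), the `capture_stderr` helper added above can be used like this; the class body is copied verbatim from test/test_compile.py:

import sys
from io import StringIO


class capture_stderr(list):
    """Replace sys.stderr with a temporary StringIO."""

    def __enter__(self):
        self.sys_stderr = sys.stderr
        self.stringio = StringIO()
        sys.stderr = self.stringio
        return self

    def __exit__(self, *args):
        self.append(str(self.stringio.getvalue()))
        del self.stringio
        sys.stderr = self.sys_stderr


with capture_stderr() as captured:
    # Anything written to stderr inside the block lands in the StringIO;
    # the text is appended to the list on exit.
    print("skipping cudagraphs due to ...", file=sys.stderr)

assert "skipping cudagraphs" in captured[0]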
