This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 605fc1d

vkuzo authored and facebook-github-bot committed
make the backward of differentiable float8 casts pass gradient as is (#255)
Summary:

Behavior before:
* high precision to float8 in fw, float8 to high precision in bw
* float8 to high precision in fw, high precision to float8 in bw if grad is a Float8Tensor, pass gradient unchanged otherwise

Behavior after:
* high precision to float8 in fw, pass gradient unchanged in bw
* float8 to high precision in fw, pass gradient unchanged in bw

Motivation for the new state:
1. we want gradients to be in high precision unless specified otherwise by the float8 recipe, and the logic to specify grad casting to float8 before the matmul is better implemented elsewhere
2. there is actually no logic change in this diff, as the backward casts were not getting hit from existing code; this diff just makes the intended behavior clearer

Pull Request resolved: #255

Test Plan:
```
./test/test_everything.sh
```

Reviewed By: drisspg, malfet, wanchaol

Differential Revision: D56956823

Pulled By: vkuzo

fbshipit-source-id: 1388420ad933a88986443effdf13ef1f8516138b
1 parent 7c53229 commit 605fc1d
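
As background for the summary above, here is a minimal, self-contained sketch of the pattern being adopted: the float8 conversion happens in forward, and backward hands the incoming gradient back as-is. This is not code from this repository; `Float8RoundTripSTE` and its signature are made up for illustration, and the forward round-trips through float8 (quantize then dequantize) so the demo stays in one dtype, whereas the repo's functions produce and consume an actual `Float8Tensor`.

```python
import torch


class Float8RoundTripSTE(torch.autograd.Function):
    """Illustrative only: scale, round-trip through float8 in forward,
    and pass the gradient through unchanged in backward."""

    @staticmethod
    def forward(ctx, tensor, scale):
        # forward: quantize to float8 and immediately dequantize, so the
        # output stays in the input's high-precision dtype
        fp8 = (tensor * scale).to(torch.float8_e4m3fn)
        return fp8.to(tensor.dtype) / scale

    @staticmethod
    def backward(ctx, g):
        # backward: the gradient is passed as-is; one None for the
        # non-differentiable `scale` argument
        return g, None


x = torch.randn(8, requires_grad=True)
out = Float8RoundTripSTE.apply(x, torch.tensor(1.0))
out.sum().backward()
# x.grad is exactly all-ones: the float8 rounding in forward did not
# touch the gradient
print(x.grad)
```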

File tree: 3 files changed, +21 −39 lines

README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -118,7 +118,7 @@ pytest test/test_compile.py
 ./test/test_tp.sh
 
 # run all of these tests
-./test/run_everything.sh
+./test/test_everything.sh
 ```
 
 # Benchmarking
````

float8_experimental/float8_tensor.py

Lines changed: 8 additions & 38 deletions
```diff
@@ -112,43 +112,12 @@ def to_fp8_no_autograd(
     return Float8Tensor(bits_fp8, x_scale, x.dtype, mm_config=mm_config)
 
 
-def from_fp8_no_autograd(x: torch.Tensor) -> torch.Tensor:
-    """Convert a tensor from float8 without autograd
-
-    This function will handle 3 cases:
-    1. If the tensor is a DTensor, it will convert the inner tensor to the original precision
-    2. If the tensor is a Float8Tensor, it will convert the tensor to the original precision
-    3. If the tensor is a regular tensor, it will pass through this tensor
-
-    Args:
-        x: the tensor to convert
-    """
-
-    def to_original_precision(grad):
-        if isinstance(grad, Float8Tensor):
-            return grad.to_original_precision()
-        else:
-            return grad
-
-    if isinstance(x, DTensor):
-        local_grad = x.to_local()
-        original_precision_grad = to_original_precision(local_grad)
-        return DTensor.from_local(
-            original_precision_grad,
-            x.device_mesh,
-            x.placements,
-            run_check=False,
-            shape=x.size(),
-            stride=x.stride(),
-        )
-    else:
-        return to_original_precision(x)
-
-
 @torch._dynamo.allow_in_graph
 class ToFloat8ConstrFunc(torch.autograd.Function):
     """
-    A differentiable conversion to fp8
+    A differentiable conversion to fp8.
+    * forward: convert from high precision to float8
+    * backward: pass the gradient without changes
     """
 
     @staticmethod
@@ -175,14 +144,15 @@ def forward(
 
     @staticmethod
     def backward(ctx, g):
-        grad = from_fp8_no_autograd(g)
-        return grad, None, None, None, None
+        return g, None, None, None, None
 
 
 @torch._dynamo.allow_in_graph
 class FromFloat8ConstrFunc(torch.autograd.Function):
     """
-    A differentiable conversion from fp8
+    A differentiable conversion from fp8.
+    * forward: convert from float8 to high precision
+    * backward: pass the gradient without changes
     """
 
     @staticmethod
@@ -191,7 +161,7 @@ def forward(ctx, tensor):
 
     @staticmethod
     def backward(ctx, g):
-        return Float8Tensor.to_float8(g), None, None
+        return g, None, None
 
 
 class Float8Tensor(torch.Tensor):
```
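
Pieced together from the hunks above, the two autograd functions now read roughly as below. This is a sketch rather than a copy of `float8_experimental/float8_tensor.py`: the forward bodies are simplified stand-ins and the parameter names are placeholders chosen so the argument counts line up with the backward return values in the diff; the backward methods themselves mirror the diff, with both directions passing the gradient through unchanged.

```python
import torch


class ToFloat8Sketch(torch.autograd.Function):
    """A differentiable conversion to fp8.
    * forward: convert from high precision to float8
    * backward: pass the gradient without changes
    """

    @staticmethod
    def forward(ctx, tensor, scale, float8_dtype, amax_buffer=None, mm_config=None):
        # placeholder for the real forward, which builds a Float8Tensor via
        # to_fp8_no_autograd(...); amax_buffer / mm_config stand in for the
        # non-differentiable config arguments
        return (tensor * scale).to(float8_dtype)

    @staticmethod
    def backward(ctx, g):
        # as in the diff: the gradient passes through, with one None per
        # non-differentiable forward argument
        return g, None, None, None, None


class FromFloat8Sketch(torch.autograd.Function):
    """A differentiable conversion from fp8.
    * forward: convert from float8 to high precision
    * backward: pass the gradient without changes
    """

    @staticmethod
    def forward(ctx, tensor, original_dtype=torch.float32, scale=None):
        # placeholder for the real forward, which calls
        # tensor.to_original_precision() on a Float8Tensor
        hp = tensor.to(original_dtype)
        return hp if scale is None else hp / scale

    @staticmethod
    def backward(ctx, g):
        # as in the diff: the gradient passes through unchanged
        return g, None, None
```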

test/test_base.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -56,6 +56,18 @@ def test_preserves_dtype(self) -> None:
         x3_hp = x2_lp.to_original_precision()
         self.assertTrue(x3_hp.dtype == hp_dtype)
 
+    def test_differentiable_casts(self) -> None:
+        lp_dtypes = (torch.float8_e4m3fn, torch.float8_e5m2)
+        for f8_dtype in lp_dtypes:
+            x = torch.randn(1).requires_grad_()
+            grad = torch.randn(1)
+            x_s = tensor_to_scale(x, f8_dtype)
+            x_f8 = Float8Tensor.to_float8(x, x_s, f8_dtype)
+            x_f8_hp = x_f8.to_original_precision()
+            x_f8_hp.backward(grad)
+            # the gradient should be unchanged through both casts
+            torch.testing.assert_close(grad, x.grad, rtol=0, atol=0)
+
 
 class TestFloat8Linear:
     def _test_linear_impl(
```

0 commit comments
