
Commit ca84eb4

standardize on using to_fp8_no_autograd

1 parent b67e5cf

2 files changed: 4 additions & 6 deletions


float8_experimental/float8_linear.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -20,14 +20,13 @@
 
 import torch
 
-from float8_experimental.float8_tensor import Float8Tensor
+from float8_experimental.float8_tensor import Float8Tensor, to_fp8_no_autograd
 
 from float8_experimental.float8_utils import (
     amax_history_to_scale,
     E4M3_MAX_POS,
     E5M2_MAX_POS,
     tensor_to_amax,
-    to_fp8_saturated,
 )
 
 
@@ -99,10 +98,9 @@ def backward(ctx, go):
         )
 
         fp8_amax_dL_dY.fill_(tensor_to_amax(go))
-        go_scaled = go * fp8_scale_dL_dY
-        bits_fp8 = to_fp8_saturated(go_scaled, torch.float8_e5m2)
+
+        res = to_fp8_no_autograd(go, fp8_scale_dL_dY, torch.float8_e5m2, ctx.emulate)
         empty_grads = None, None, None, None, None, None
-        res = Float8Tensor(bits_fp8, fp8_scale_dL_dY, go.dtype, emulate=ctx.emulate)
         return res, *empty_grads
 
 
```

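For context, here is a minimal sketch of what `to_fp8_no_autograd` plausibly does, reconstructed from the three inline lines it replaces in `backward()`. The real helper lives in `float8_experimental/float8_tensor.py` and may differ in signature or details; this is not the authoritative implementation.

```python
import torch

from float8_experimental.float8_tensor import Float8Tensor
from float8_experimental.float8_utils import to_fp8_saturated


def to_fp8_no_autograd(
    x: torch.Tensor,
    x_scale: torch.Tensor,
    float8_dtype: torch.dtype,
    emulate: bool,
) -> Float8Tensor:
    # Hypothetical reconstruction from the removed inline code: scale the
    # tensor, saturate-cast it to the target float8 dtype, and wrap the raw
    # bits in a Float8Tensor that remembers the scale and original dtype.
    # The name suggests this path is meant to run where autograd is not
    # tracking, e.g. inside backward().
    x_scaled = x * x_scale
    bits_fp8 = to_fp8_saturated(x_scaled, float8_dtype)
    return Float8Tensor(bits_fp8, x_scale, x.dtype, emulate=emulate)
```

Folding the scale/cast/wrap sequence into one helper is what lets this commit delete the three-line version in `backward()` and, per the commit message, standardize every cast site on the same code path.
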
float8_experimental/float8_utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -86,7 +86,7 @@ def tensor_to_scale(x, float8_dtype):
     return amax_to_scale(amax, float8_dtype, x.dtype)
 
 
-def to_fp8_saturated(x, float8_dtype):
+def to_fp8_saturated(x, float8_dtype: torch.dtype):
     # The default behavior in PyTorch for casting to `float8_e4m3fn`
     # and `e5m2` is to not saturate. In this context, we should saturate.
     # A common case where we want to saturate is when the history of a
```
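The diff cuts off the body of `to_fp8_saturated`. Below is a plausible sketch of a saturated cast, assuming the `E4M3_MAX_POS` and `E5M2_MAX_POS` constants that `float8_linear.py` imports from this module (448.0 and 57344.0 are the standard maxima for these formats); the actual body may differ.

```python
import torch

# Largest normal magnitudes representable in each float8 format.
E4M3_MAX_POS = 448.0
E5M2_MAX_POS = 57344.0


def to_fp8_saturated(x: torch.Tensor, float8_dtype: torch.dtype) -> torch.Tensor:
    # Sketch of a saturated cast: clamp into the representable range before
    # casting, because PyTorch's default float8 cast does not saturate and
    # turns out-of-range values into inf/nan instead of the dtype's max.
    if float8_dtype == torch.float8_e4m3fn:
        x = x.clamp(min=-E4M3_MAX_POS, max=E4M3_MAX_POS)
    elif float8_dtype == torch.float8_e5m2:
        x = x.clamp(min=-E5M2_MAX_POS, max=E5M2_MAX_POS)
    else:
        raise ValueError(f"Unsupported float8 dtype: {float8_dtype}")
    return x.to(float8_dtype)
```

With the scale applied first (as in `to_fp8_no_autograd` above), values that would otherwise overflow the format get pinned to the dtype's maximum instead of becoming inf/nan.
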
