This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 4a27a27

still some numeric issues on amd:
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[True-LinearType.DYNAMIC-x_shape0-False] - AssertionError: -3.183703660964966 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[True-LinearType.DYNAMIC-x_shape1-False] - AssertionError: -3.2964067459106445 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[True-LinearType.DYNAMIC-x_shape2-False] - AssertionError: -3.091813564300537 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape0-True] - AssertionError: 7.574269771575928 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape0-False] - AssertionError: -2.132262706756592 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape1-True] - AssertionError: 8.139453887939453 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape1-False] - AssertionError: -1.483538269996643 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape2-True] - AssertionError: 8.950117111206055 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DELAYED-x_shape2-False] - AssertionError: -1.840381145477295 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DYNAMIC-x_shape0-False] - AssertionError: -3.1304943561553955 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DYNAMIC-x_shape1-False] - AssertionError: -3.246392250061035 is too low
FAILED test/test_base.py::TestFloat8Linear::test_linear_nobias[False-LinearType.DYNAMIC-x_shape2-False] - AssertionError: -3.015180826187134 is too low
1 parent 829e9c4 commit 4a27a27

File tree: 6 files changed (+83, -42 lines)
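
The "is too low" assertions above come from comparing the fp8 modules against a higher-precision reference with a signal-to-quantization-noise style metric and failing when the ratio drops below a floor. The test file itself is not part of this diff, so the sketch below is only illustrative: the function name, the stand-in tensors, and the 25 dB threshold are placeholders, not the repository's actual values.

import torch

def sqnr_db(ref: torch.Tensor, approx: torch.Tensor) -> torch.Tensor:
    # signal-to-quantization-noise ratio in decibels; higher means closer to the reference
    return 20 * torch.log10(ref.norm() / (ref - approx).norm())

ref = torch.randn(16, 32)
approx = ref + 0.01 * torch.randn_like(ref)  # stand-in for an fp8 round trip
threshold_db = 25.0  # hypothetical floor, not the suite's real tolerance
value = sqnr_db(ref, approx)
assert value > threshold_db, f"{value.item()} is too low"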

float8_experimental/float8_dynamic_linear.py

Lines changed: 22 additions & 14 deletions

@@ -7,9 +7,10 @@
 A wrapper around a `torch.nn.Linear` module which does fp8 compute.
 """
 import torch
+from typing import Optional
 
 from float8_experimental.float8_tensor import Float8Tensor, to_fp8_no_autograd
-from float8_experimental.float8_utils import IS_AMD, tensor_to_scale
+from float8_experimental.float8_utils import IS_AMD, tensor_to_scale, FP8Dtypes
 
 
 @torch._dynamo.allow_in_graph
@@ -24,16 +25,17 @@ def forward(
         ctx,
         tensor,
         emulate: bool,
+        fp8_dtype_bw: torch.dtype
     ):
         ctx.emulate = emulate
+        ctx.fp8_dtype_bw = fp8_dtype_bw
         return tensor
 
     @staticmethod
     def backward(ctx, gradY):
-        fp8_dtype = torch.float8_e5m2fnuz if IS_AMD else torch.float8_e5m2
-        gradY_scale = tensor_to_scale(gradY, fp8_dtype)
-        fp8_tensor = to_fp8_no_autograd(gradY, gradY_scale, fp8_dtype, ctx.emulate)
-        return fp8_tensor, None
+        gradY_scale = tensor_to_scale(gradY, ctx.fp8_dtype_bw)
+        fp8_tensor = to_fp8_no_autograd(gradY, gradY_scale, ctx.fp8_dtype_bw, ctx.emulate)
+        return fp8_tensor, None, None
 
 
 def cast_x_to_float8_e4m3fn_pre_hook(module, args):
@@ -61,18 +63,23 @@ class Float8DynamicLinear(torch.nn.Linear):
     conversion to fp8 of the input and weight tensors.
     """
 
-    def __init__(self, use_activation_hooks: bool, **super_kwargs):
+    def __init__(self, use_activation_hooks: bool, fp8_dtype: FP8Dtypes, **super_kwargs):
         """
         Args:
             use_activation_hooks (bool): whether to use activation hooks for casting to and from float8
+            fp8_dtype (FP8Dtypes): the fp8 dtypes to use for the forward and backward casts
         """
         super().__init__(**super_kwargs)
 
         self.use_activation_hooks = use_activation_hooks
+        # I want to store the dataclass but I think that will break torch compile
+        self.fp8_dtype_fw = fp8_dtype.fp8_dtype_fw
+        self.fp8_dtype_bw = fp8_dtype.fp8_dtype_bw
+        self.emulate = False
 
-    def forward(self, x):
+    def forward(self, input):
         # cast x to float8_e4m3fn if not using activation hooks
-        x_fp8 = x if self.use_activation_hooks else self.cast_to_float8_e4m3(x)
+        x_fp8 = input if self.use_activation_hooks else self.cast_to_float8_e4m3(input)
 
         # cast w to float8_e4m3fn
         w_fp8 = self.cast_to_float8_e4m3(self.weight)
@@ -94,10 +101,9 @@ def cast_to_float8_e4m3(self, inpt_tensor: torch.Tensor) -> Float8Tensor:
         - On AMD Gpus, it casts to torch.float8_e4m3fnuz
 
         """
-        fp8_dtype = torch.float8_e4m3fnuz if IS_AMD else torch.float8_e4m3fn
-        scale = tensor_to_scale(inpt_tensor, fp8_dtype)
+        scale = tensor_to_scale(inpt_tensor, self.fp8_dtype_fw)
         return Float8Tensor.to_float8(
-            inpt_tensor, scale, fp8_dtype, emulate=self.emulate
+            inpt_tensor, scale, self.fp8_dtype_fw, emulate=self.emulate
         )
 
     def cast_to_float8_e5m2_bw(self, gradY: torch.Tensor) -> torch.Tensor:
@@ -110,11 +116,11 @@ def cast_to_float8_e5m2_bw(self, gradY: torch.Tensor) -> torch.Tensor:
         - On AMD Gpus, it casts to torch.float8_e4m3fnuz
 
         """
-        return NoopFwToFloat8E5M2Bw.apply(gradY, self.emulate)
+        return NoopFwToFloat8E5M2Bw.apply(gradY, self.emulate, self.fp8_dtype_bw)
 
     @classmethod
     def from_float(
-        cls, mod, emulate: bool = False, use_activation_hooks: bool = False
+        cls, mod, emulate: bool = False, use_activation_hooks: bool = False, fp8_dtypes: Optional[FP8Dtypes] = None
     ) -> "Float8DynamicLinear":
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
@@ -124,13 +130,15 @@ def from_float(
             emulate (bool): whether to emulate fp8 matmul logic in float32
             use_activation_hooks (bool): whether to use activation hooks for casting to and from float8
         """
+        if fp8_dtypes is None:
+            fp8_dtypes = FP8Dtypes()
         with torch.device("meta"):
             super_kwargs = {
                 "in_features": mod.in_features,
                 "out_features": mod.out_features,
                 "bias": False,
            }
-        new_mod = cls(use_activation_hooks, **super_kwargs)
+        new_mod = cls(use_activation_hooks, fp8_dtypes, **super_kwargs)
         new_mod.weight = mod.weight
         new_mod.bias = mod.bias
         new_mod.emulate = emulate
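
With this change the AMD branch no longer lives inside the autograd function; the backward dtype rides along through NoopFwToFloat8E5M2Bw.apply. Below is a minimal sketch of how the new plumbing could be exercised, assuming a PyTorch build that exposes the fnuz dtypes and a CUDA/ROCm device; emulate=True keeps the matmul in float32, so no hardware fp8 matmul support is needed.

import torch
from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
from float8_experimental.float8_utils import FP8Dtypes

# AMD-style configuration: fnuz variants for both the forward and backward casts
amd_dtypes = FP8Dtypes(
    fp8_dtype_fw=torch.float8_e4m3fnuz,  # activations and weights
    fp8_dtype_bw=torch.float8_e5m2fnuz,  # gradients
)

lin = torch.nn.Linear(32, 64, bias=False, device="cuda", dtype=torch.bfloat16)
fp8_lin = Float8DynamicLinear.from_float(lin, emulate=True, fp8_dtypes=amd_dtypes)

x = torch.randn(8, 32, device="cuda", dtype=torch.bfloat16, requires_grad=True)
fp8_lin(x).sum().backward()  # gradY is cast with ctx.fp8_dtype_bw in backward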

float8_experimental/float8_linear.py

Lines changed: 35 additions & 19 deletions

@@ -14,7 +14,7 @@
 
 import dataclasses
 
-from typing import Optional
+from typing import Optional, Literal
 
 import float8_experimental.config as config
 
@@ -27,17 +27,18 @@
     E4M3_MAX_POS,
     E5M2_MAX_POS,
     tensor_to_amax,
+    FP8Dtypes
 )
 
 
 def _maybe_initialize_amaxes_scales_for_float8_cast(
-    x,
-    cur_amax,
-    amax_history,
-    scale,
-    scale_fn_name,
-    float8_dtype,
-    is_initialized,
+    x: torch.Tensor,
+    cur_amax: torch.Tensor,
+    amax_history: torch.Tensor,
+    scale: torch.Tensor,
+    scale_fn_name: Literal["max"],
+    float8_dtype: torch.dtype,
+    is_initialized: bool,
 ):
     """
     If x is about to be cast to `float8` and the amax buffers are not initialized,
@@ -74,11 +75,13 @@ def forward(
         scale_fn_name,
         is_amax_initialized,
         emulate: bool,
+        fp8_dtype: torch.dtype,
     ):
         ctx.save_for_backward(fp8_amax_dL_dY, fp8_amax_history_dL_dY, fp8_scale_dL_dY)
         ctx.scale_fn_name = scale_fn_name
         ctx.is_amax_initialized = is_amax_initialized
         ctx.emulate = emulate
+        ctx.fp8_dtype = fp8_dtype
         return tensor
 
     @staticmethod
@@ -93,14 +96,14 @@ def backward(ctx, go):
             fp8_amax_history_dL_dY,
             fp8_scale_dL_dY,
             scale_fn_name,
-            torch.float8_e5m2,
+            ctx.fp8_dtype,
             is_amax_initialized,
         )
 
         fp8_amax_dL_dY.fill_(tensor_to_amax(go))
 
-        res = to_fp8_no_autograd(go, fp8_scale_dL_dY, torch.float8_e5m2, ctx.emulate)
-        empty_grads = None, None, None, None, None, None
+        res = to_fp8_no_autograd(go, fp8_scale_dL_dY, ctx.fp8_dtype, ctx.emulate)
+        empty_grads = None, None, None, None, None, None, None
         return res, *empty_grads
 
 
@@ -178,6 +181,14 @@ def __init__(self, *args, **kwargs):
         # and torch.compile, this option can disable them
         self.enable_pre_and_post_forward = config.enable_pre_and_post_forward
 
+        # In the forward we will cast both the activation and weight to float8.
+        # There are currently 4 different fp8 variants in pytorch, see
+        # https://github.com/openxla/stablehlo/blob/main/rfcs/20230321-fp8_fnuz.md
+        # fp8_dtype_fw is the type used for casting the activation and weight
+        # fp8_dtype_bw is the type used for casting the gradient
+        self.fp8_dtype_fw = torch.float8_e4m3fn
+        self.fp8_dtype_bw = torch.float8_e5m2
+
     def register_always_float32_buffer(
         self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True
     ) -> None:
@@ -212,11 +223,11 @@ def cast_x_to_float8(
             self.fp8_amax_history_x,
             self.fp8_scale_x,
             scale_fn_name,
-            torch.float8_e4m3fn,
+            self.fp8_dtype_fw,
             is_amax_initialized,
         )
         x_fp8 = Float8Tensor.to_float8(
-            x, self.fp8_scale_x, torch.float8_e4m3fn, self.fp8_amax_x, self.emulate
+            x, self.fp8_scale_x, self.fp8_dtype_fw, self.fp8_amax_x, self.emulate
         )
         return x_fp8
 
@@ -230,14 +241,14 @@ def cast_w_to_float8(
             self.fp8_amax_history_w,
             self.fp8_scale_w,
             scale_fn_name,
-            torch.float8_e4m3fn,
+            self.fp8_dtype_fw,
             is_amax_initialized,
         )
 
         w_fp8 = Float8Tensor.to_float8(
             w,
             self.fp8_scale_w,
-            torch.float8_e4m3fn,
+            self.fp8_dtype_fw,
             self.fp8_amax_w,
             self.emulate,
         )
@@ -255,6 +266,7 @@ def cast_y_to_float8_in_bw(
             scale_fn_name,
             self.is_amax_initialized,
             emulate,
+            self.fp8_dtype_bw,
         )
         return y
 
@@ -286,10 +298,10 @@ class Float8Linear(Float8LinearMixin, torch.nn.Linear):
     scales in way friendly to delayed scaling.
     """
 
-    def forward(self, x):
-        self.float8_pre_forward(x)
+    def forward(self, input):
+        self.float8_pre_forward(input)
 
-        x_fp8 = self.cast_x_to_float8(x, self.is_amax_initialized)
+        x_fp8 = self.cast_x_to_float8(input, self.is_amax_initialized)
         w_fp8 = self.cast_w_to_float8(self.weight, self.is_amax_initialized)
 
         y = torch.matmul(x_fp8, w_fp8.t())
@@ -304,7 +316,7 @@ def forward(self, x):
         return y
 
     @classmethod
-    def from_float(cls, mod, emulate: bool = False, use_activation_hooks: bool = False):
+    def from_float(cls, mod, emulate: bool = False, use_activation_hooks: bool = False, fp8_dtypes: Optional[FP8Dtypes] = None):
         """
         Create an nn.Linear with fp8 compute from a regular nn.Linear
 
@@ -314,13 +326,17 @@ def from_float(cls, mod, emulate: bool = False, use_activation_hooks: bool = Fal
         use_activation_hooks (bool): whether to use activation hooks instead of inlining the casting logic
         """
         assert not use_activation_hooks, "use_activation_hooks is not supported yet!"
+        if fp8_dtypes is None:
+            fp8_dtypes = FP8Dtypes()
         # TODO Follow up! This is a great idea but we need the mixin base to create real
         # Tensors and the Linear base to create empty params
         # with torch.device("meta"):
         new_mod = cls(mod.in_features, mod.out_features, bias=False)
         new_mod.weight = mod.weight
         new_mod.bias = mod.bias
         new_mod.emulate = emulate
+        new_mod.fp8_dtype_fw = fp8_dtypes.fp8_dtype_fw
+        new_mod.fp8_dtype_bw = fp8_dtypes.fp8_dtype_bw
         # I think its okay to send all params and buffers to device
         new_mod.to(mod.weight.device)
         return new_mod
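
For the delayed-scaling path, __init__ now seeds the non-AMD defaults and from_float overrides them when an FP8Dtypes is supplied. A small sketch of that behavior, assuming the fnuz dtypes are available in the installed PyTorch build:

import torch
from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_utils import FP8Dtypes

# Defaults from __init__: e4m3fn for activations/weights, e5m2 for gradients
m = Float8Linear.from_float(torch.nn.Linear(16, 16, bias=False))
assert m.fp8_dtype_fw is torch.float8_e4m3fn
assert m.fp8_dtype_bw is torch.float8_e5m2

# Passing an FP8Dtypes overrides both directions (here with the AMD fnuz pair)
m_amd = Float8Linear.from_float(
    torch.nn.Linear(16, 16, bias=False),
    fp8_dtypes=FP8Dtypes(torch.float8_e4m3fnuz, torch.float8_e5m2fnuz),
)
assert m_amd.fp8_dtype_fw is torch.float8_e4m3fnuz
assert m_amd.fp8_dtype_bw is torch.float8_e5m2fnuz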

float8_experimental/float8_linear_utils.py

Lines changed: 4 additions & 1 deletion

@@ -14,7 +14,7 @@
 from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear import Float8Linear
 
-from float8_experimental.float8_utils import amax_history_to_scale_stack
+from float8_experimental.float8_utils import amax_history_to_scale_stack, FP8Dtypes
 from torch.distributed._functional_collectives import all_reduce, AsyncCollectiveTensor
 
 log = logging.getLogger(__name__)
@@ -34,13 +34,15 @@ def get_float8_linear(
     linear_ref: torch.nn.Linear,
     emulate: bool = False,
     use_activation_hooks: bool = False,
+    fp8_dtypes: Optional[FP8Dtypes] = None,
 ):
     """Returns a Float8Linear module of the given type, initialized from linear_ref.
     Args:
         linear_type: The type of Float8Linear to return.
         linear_ref: The linear module to initialize from.
         emulate: Whether to emulate the fp8 matmul logic in float32.
         use_activation_hooks: Whether to use activation hooks for dynamic linear.
+        fp8_dtypes: The FP8 dtypes to use.
     """
     LINEAR_TYPE_MAP = {
         LinearType.DELAYED: Float8Linear,
@@ -54,6 +56,7 @@ def get_float8_linear(
         copy.deepcopy(linear_ref),
         emulate=emulate,
         use_activation_hooks=use_activation_hooks,
+        fp8_dtypes=fp8_dtypes,
     )
 
 
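
This helper, used by the test suite, simply forwards the new argument to whichever from_float it dispatches to, so both LinearType.DELAYED and LinearType.DYNAMIC pick up the same dtype pair. A hedged usage sketch; LinearType is assumed to be importable from this module, as the LINEAR_TYPE_MAP above suggests:

import torch
from float8_experimental.float8_linear_utils import LinearType, get_float8_linear
from float8_experimental.float8_utils import FP8Dtypes

ref = torch.nn.Linear(32, 32, bias=False)
fnuz = FP8Dtypes(torch.float8_e4m3fnuz, torch.float8_e5m2fnuz)

dynamic_fp8 = get_float8_linear(LinearType.DYNAMIC, ref, emulate=True, fp8_dtypes=fnuz)
delayed_fp8 = get_float8_linear(LinearType.DELAYED, ref, emulate=True, fp8_dtypes=fnuz)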

float8_experimental/float8_tensor.py

Lines changed: 1 addition & 1 deletion

@@ -230,7 +230,7 @@ def to_float8(
         float8_dtype: torch.dtype,
         amax_buffer: Optional[torch.Tensor] = None,
         emulate: bool = False,
-    ):
+    ) -> "Float8Tensor":
         """Converts a higher precision tensor to float8 in a differentiable way.
 
         Args:
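
The only change here is the return annotation, but it documents the call pattern used throughout the linears above. A short, hedged example of the tensor-level API using the same tensor_to_scale then to_float8 sequence shown in the diffs; emulate=True is used so it should not require fp8 matmul hardware, and a GPU may still be needed if these utilities reject CPU tensors.

import torch
from float8_experimental.float8_tensor import Float8Tensor
from float8_experimental.float8_utils import tensor_to_scale

t = torch.randn(4, 4)
scale = tensor_to_scale(t, torch.float8_e4m3fn)
t_fp8 = Float8Tensor.to_float8(t, scale, torch.float8_e4m3fn, emulate=True)
assert isinstance(t_fp8, Float8Tensor)  # matches the new "Float8Tensor" annotation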

float8_experimental/float8_utils.py

Lines changed: 8 additions & 1 deletion

@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from typing import Literal
+from dataclasses import dataclass
 
 import torch
 import torch.distributed as dist
@@ -27,6 +28,12 @@
 IS_AMD = torch.cuda.is_available() and torch.version.hip is not None
 
 
+@dataclass(frozen=True)
+class FP8Dtypes:
+    """Defines the fp8 dtypes to be used in forward and backward computations"""
+    fp8_dtype_fw: torch.dtype = torch.float8_e4m3fn
+    fp8_dtype_bw: torch.dtype = torch.float8_e5m2
+
 @torch.no_grad()
 def amax_to_scale(
     amax: torch.Tensor, float8_dtype: torch.dtype, orig_dtype: torch.dtype
@@ -61,7 +68,7 @@ def amax_to_scale(
 @torch.no_grad()
 def amax_history_to_scale(
     amax_history: torch.Tensor,
-    float8_dtype: torch.Tensor,
+    float8_dtype: torch.dtype,
     orig_dtype: torch.dtype,
     history_to_scale_fn_type: Literal["max"],
 ):
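
FP8Dtypes is a frozen dataclass, so a configuration can be built once and shared across modules without risk of mutation; the defaults encode the non-AMD pairing (e4m3fn forward, e5m2 backward), while the fnuz variants can be passed in for AMD. A brief sketch, assuming the fnuz dtypes exist in the installed PyTorch build:

import torch
from float8_experimental.float8_utils import FP8Dtypes

default_dtypes = FP8Dtypes()  # fp8_dtype_fw=e4m3fn, fp8_dtype_bw=e5m2
amd_dtypes = FP8Dtypes(
    fp8_dtype_fw=torch.float8_e4m3fnuz,
    fp8_dtype_bw=torch.float8_e5m2fnuz,
)

assert default_dtypes.fp8_dtype_bw is torch.float8_e5m2
# frozen=True: attempting to mutate a field raises dataclasses.FrozenInstanceError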
