 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Tuple
+from typing import Literal, Tuple

 import torch
 import torch.distributed as dist

 # Helpful visualizer for debugging (only supports fp32):
 # https://www.h-schmidt.net/FloatConverter/IEEE754.html

-# define the e4m3/e5m2 constants
-E4M3_MAX_POS = torch.finfo(torch.float8_e4m3fn).max
-E5M2_MAX_POS = torch.finfo(torch.float8_e5m2).max
-
-FP16_MAX_POS = torch.finfo(torch.float16).max
-
 # avoid division by zero when calculating scale
 # TODO: align this value with NVIDIA's assumptions (current value is a guess)
 EPS = 1e-12

+IS_AMD = torch.cuda.is_available() and torch.version.hip is not None
+FP8_TYPES = {
+    torch.float8_e4m3fn,
+    torch.float8_e5m2,
+    torch.float8_e4m3fnuz,
+    torch.float8_e5m2fnuz,
+}
+
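As context for the change above: the hard-coded `E4M3_MAX_POS`/`E5M2_MAX_POS` constants are replaced by per-dtype `torch.finfo(dtype).max` lookups, which also cover the AMD `fnuz` variants. A minimal sketch of what that lookup returns (assuming a PyTorch build with the fp8 dtypes):

```python
import torch

# Per-dtype max values, read on demand instead of module-level constants.
for dtype in (
    torch.float8_e4m3fn,    # max 448.0
    torch.float8_e5m2,      # max 57344.0
    torch.float8_e4m3fnuz,  # max 240.0 (AMD/fnuz variant)
    torch.float8_e5m2fnuz,  # max 57344.0 (AMD/fnuz variant)
):
    print(dtype, torch.finfo(dtype).max)
```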
 
 @torch.no_grad()
-def amax_to_scale(amax, float8_dtype, orig_dtype):
+def amax_to_scale(
+    amax: torch.Tensor, float8_dtype: torch.dtype, orig_dtype: torch.dtype
+):
+    """Converts the amax value of a tensor to the fp8 scale.
+    Args:
+        amax: The amax value of the tensor.
+        float8_dtype: The float8 dtype.
+        orig_dtype: The original dtype of the tensor.
+    """
     scale = torch.empty_like(amax, dtype=torch.float32)
-    if float8_dtype == torch.float8_e4m3fn:
-        res = E4M3_MAX_POS / torch.clamp(amax, min=EPS)
-    else:  # e5m2
-        res = E5M2_MAX_POS / torch.clamp(amax, min=EPS)
+    if float8_dtype in FP8_TYPES:
+        res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
+    else:
+        raise ValueError(f"Unsupported float8_dtype: {float8_dtype}")

     # Ensure that the scale is representable in float16,
     # this helps when amax is small. We are assuming that we don't need
     # to care about this for float32/bfloat16.
     if orig_dtype is torch.float16:
-        res = torch.clamp(res, max=FP16_MAX_POS)
+        res = torch.clamp(res, max=torch.finfo(torch.float16).max)
     scale.copy_(res)
     return scale
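A quick usage sketch for the updated `amax_to_scale`; the values are hypothetical, chosen so the arithmetic is easy to check:

```python
import torch

# amax would normally be torch.max(torch.abs(t)) of the tensor being cast.
amax = torch.tensor(3.5)
scale = amax_to_scale(amax, torch.float8_e4m3fn, torch.float32)
# finfo(float8_e4m3fn).max / 3.5 == 448.0 / 3.5 == 128.0
assert scale.item() == 128.0
```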
 
 
 @torch.no_grad()
 def amax_history_to_scale(
-    amax_history,
-    float8_dtype,
-    orig_dtype,
-    history_to_scale_fn_type,
+    amax_history: torch.Tensor,
+    float8_dtype: torch.dtype,
+    orig_dtype: torch.dtype,
+    history_to_scale_fn_type: Literal["max"],
 ):
+    """Takes in a history of amax values and returns a scale tensor.
+    Args:
+        amax_history: A tensor containing the history of amax values.
+        float8_dtype: The float8 dtype.
+        orig_dtype: The original dtype of the tensor.
+        history_to_scale_fn_type: The type of function to use to convert the history to a scale.
+    """
     if history_to_scale_fn_type == "max":
         amax = torch.max(amax_history)
         return amax_to_scale(amax, float8_dtype, orig_dtype)
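For illustration, a toy call to `amax_history_to_scale` under the only supported fn type, `"max"` (history values are made up):

```python
import torch

history = torch.tensor([1.0, 2.0, 4.0])  # hypothetical recent amax values
scale = amax_history_to_scale(history, torch.float8_e4m3fn, torch.float32, "max")
# max(history) == 4.0, so scale == 448.0 / 4.0 == 112.0
```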
@@ -58,9 +75,15 @@ def amax_history_to_scale_stack(
     amax_history: torch.Tensor,
     float8_dtype: torch.dtype,
     orig_dtype: torch.dtype,
-    history_to_scale_fn_type: str,
+    history_to_scale_fn_type: Literal["max"],
 ) -> torch.Tensor:
-    """Takes in a stack of amax_history tensors and returns a scale tensor."""
+    """Takes in a stack of amax_history tensors and returns a scale tensor.
+    Args:
+        amax_history: A 2D tensor containing a stack of amax histories.
+        float8_dtype: The float8 dtype.
+        orig_dtype: The original dtype of the tensor.
+        history_to_scale_fn_type: The type of function to use to convert the history to a scale.
+    """
     if history_to_scale_fn_type == "max":
         amax_stack = torch.max(amax_history, dim=1).values
         return amax_to_scale(amax_stack, float8_dtype, orig_dtype)
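The stacked variant reduces each row of a 2D history to its own amax, producing one scale per tensor. A small sketch with made-up numbers:

```python
import torch

stack = torch.tensor([[1.0, 2.0], [8.0, 4.0]])  # one history row per tensor
scales = amax_history_to_scale_stack(stack, torch.float8_e4m3fn, torch.float32, "max")
# row maxes are [2.0, 8.0] -> scales [448/2, 448/8] == [224.0, 56.0]
```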
@@ -90,21 +113,35 @@ def tensor_to_scale(
     return amax_to_scale(amax, float8_dtype, x.dtype)


-def to_fp8_saturated(x, float8_dtype: torch.dtype):
-    # The default behavior in PyTorch for casting to `float8_e4m3fn`
-    # and `e5m2` is to not saturate. In this context, we should saturate.
-    # A common case where we want to saturate is when the history of a
-    # tensor has a maximum value of `amax1`, and the current amax value
-    # is `amax2`, where `amax1 < amax2`. This is common when using delayed
-    # scaling.
-    if float8_dtype == torch.float8_e4m3fn:
-        x = x.clamp(min=-1 * E4M3_MAX_POS, max=E4M3_MAX_POS)
+def to_fp8_saturated(x: torch.Tensor, float8_dtype: torch.dtype):
+    """Converts a tensor to a saturated fp8 tensor.
+
+    Note:
+        The default behavior in PyTorch for casting to `float8_e4m3fn`
+        and `e5m2` is to not saturate. In this context, we should saturate.
+        A common case where we want to saturate is when the history of a
+        tensor has a maximum value of `amax1`, and the current amax value
+        is `amax2`, where `amax1 < amax2`. This is common when using delayed
+        scaling.
+    """
+    if float8_dtype in FP8_TYPES:
+        max_value = torch.finfo(float8_dtype).max
+        x = x.clamp(min=-max_value, max=max_value)
+        return x.to(float8_dtype)
     else:
-        x = x.clamp(min=-1 * E5M2_MAX_POS, max=E5M2_MAX_POS)
-    return x.to(float8_dtype)
+        raise ValueError(f"Unsupported float8_dtype: {float8_dtype}")
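To see what the saturation buys, compare against a plain cast. The example values are hypothetical; per the docstring above, the default cast does not saturate, and since `float8_e4m3fn` has no inf, an unsaturated overflow is expected to land on NaN:

```python
import torch

x = torch.tensor([1000.0])  # above the e4m3fn max of 448.0
sat = to_fp8_saturated(x, torch.float8_e4m3fn)  # clamps to 448.0 before casting
raw = x.to(torch.float8_e4m3fn)                 # default cast: overflows
print(sat.float(), raw.float())                 # tensor([448.]) vs tensor([nan])
```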
 
 
-def compute_error(x, y):
+def compute_error(x: torch.Tensor, y: torch.Tensor):
+    """Computes the error between two tensors in dB.
+
+    For more details see:
+        https://en.wikipedia.org/wiki/Signal-to-noise_ratio
+
+    Args:
+        x: The original tensor.
+        y: The tensor to compare to the original tensor.
+    """
     Ps = torch.norm(x)
     Pn = torch.norm(x - y)
     return 20 * torch.log10(Ps / Pn)
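`compute_error` is effectively an SQNR in dB: signal power over noise power, on a log scale. A short usage sketch with arbitrary inputs:

```python
import torch

x = torch.randn(1024)
y = x + 0.01 * torch.randn(1024)  # slightly perturbed copy of x
print(compute_error(x, y))  # higher dB = closer match; identical inputs give inf
```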
@@ -113,11 +150,19 @@ def compute_error(x, y):
 def fp8_tensor_statistics(
     tensor: torch.Tensor, float8_dtype=torch.float8_e4m3fn
 ) -> Tuple[int, ...]:
-    """Calculate FP8 tensor stats"""
-    if float8_dtype == torch.float8_e4m3fn:
-        FP8_MAX = E4M3_MAX_POS
-    else:  # e5m2
-        FP8_MAX = E5M2_MAX_POS
+    """Calculate FP8 tensor stats
+
+    Args:
+        tensor: The tensor to calculate stats for.
+        float8_dtype: The float8 dtype.
+
+    Returns:
+        A tuple containing the number of zeros and the number of max values.
+    """
+    if float8_dtype in FP8_TYPES:
+        FP8_MAX = torch.finfo(float8_dtype).max
+    else:
+        raise ValueError(f"Unsupported float8_dtype: {float8_dtype}")
     tensor_orig_type = tensor._data.to(dtype=tensor._orig_dtype)
     num_max = (torch.abs(tensor_orig_type) == FP8_MAX).sum().item()
     num_zero = (tensor_orig_type == 0).sum().item()