This repository was archived by the owner on Aug 7, 2024. It is now read-only.

[FSDP2] precompute scale after optimizer.step for dynamic scaling #266

Closed
wants to merge 32 commits
Changes from 5 commits
Commits
32 commits
9d5595c
[FSDP2] set vocab_size=32 to avoid must be divisible by 16 error
weifengpy May 21, 2024
e7005c2
precast after optimizer.step and dump profiler traces
weifengpy May 21, 2024
e41d589
Merge branch 'main' into fsdp2
weifengpy May 21, 2024
e0bee10
precast and preamax unit test
weifengpy May 24, 2024
c0ba5a2
remove duplicate vocab
weifengpy May 24, 2024
8da238e
fused amax
weifengpy May 30, 2024
ffff5ed
Merge branch 'main' into fsdp2
weifengpy Jun 6, 2024
aefa21b
use FP8_TYPES and max
weifengpy Jun 6, 2024
d4a1db7
commit all changes before cleaning
weifengpy Jun 6, 2024
d36e79b
pre_compute and flatten / unflatten
weifengpy Jun 6, 2024
6f244a2
remove unused constant
weifengpy Jun 6, 2024
dc5eab0
torch.compile works
weifengpy Jun 6, 2024
546e979
eager ready
weifengpy Jun 6, 2024
229ede6
linter
weifengpy Jun 6, 2024
d5b3ff6
linter
weifengpy Jun 6, 2024
4f05e04
flatten tensor
weifengpy Jun 25, 2024
3de59af
commit all changes for review before rebasing
weifengpy Jul 8, 2024
ffcd197
rebase on unified float8linear
weifengpy Jul 9, 2024
6b18947
Merge branch 'pytorch-labs:main' into fsdp2
weifengpy Jul 9, 2024
562424c
move precompute to fsdp_utils.py
weifengpy Jul 9, 2024
75e0e45
simplify amax calc
weifengpy Jul 9, 2024
fe95f8b
explain _pre_computed_amax
weifengpy Jul 9, 2024
1cbaa13
fix linter
weifengpy Jul 9, 2024
fe2e0a0
document precompute_float8_amax_for_fsdp
weifengpy Jul 9, 2024
e4eaa2a
rename pre_compute to precompute
weifengpy Jul 9, 2024
e4245e4
Merge branch 'main' into fsdp2
weifengpy Jul 10, 2024
e12c973
remove clamp_amax=True/False
weifengpy Jul 10, 2024
9ef67fb
precompute scale
weifengpy Jul 10, 2024
fa2f08a
unit test for precomputing scales
weifengpy Jul 10, 2024
ba085e5
add precompute scale in README
weifengpy Jul 10, 2024
ac0afb0
rename to precompute_float8_dynamic_scale_for_fsdp
weifengpy Jul 11, 2024
8e56dfc
rename to precompute_float8_dynamic_scale_for_fsdp
weifengpy Jul 11, 2024
29 changes: 24 additions & 5 deletions float8_experimental/float8_dynamic_linear.py
@@ -22,7 +22,7 @@
tensor_already_casted_to_fp8,
to_fp8_no_autograd,
)
from float8_experimental.float8_utils import tensor_to_scale
from float8_experimental.float8_utils import amax_to_scale, tensor_to_scale
from torch._prims_common import suggest_memory_format


@@ -144,13 +144,19 @@ def __new__(cls, tensor: torch.Tensor, mm_config: ScaledMMConfig):
dtype=tensor.dtype,
layout=tensor.layout,
device=tensor.device,
pin_memory=tensor.is_pinned(),
# TODO: workaround for fake tensor not implementing is_pinned()
# pin_memory=tensor.is_pinned(),
pin_memory=False,
requires_grad=tensor.requires_grad,
)

def __init__(self, tensor: torch.Tensor, mm_config: ScaledMMConfig):
self._tensor = tensor
self._mm_config = mm_config
# Optional cache for pre-computed fp8 data/scale
self._fp8_data: Optional[torch.Tensor] = None
Contributor

One major requirement for tensor subclasses that I don't think is respected here: __tensor_flatten__ and __tensor_unflatten__ must properly convey every inner tensor on the subclass.

So when we call __tensor_flatten__ on this subclass, if any of _fp8_data/_fp8_scale/_fp8_amax are set to valid tensors, they need to be returned there (and similarly __tensor_unflatten__ needs to handle them as extra args).

Contributor Author

Thanks for pointing this out! This saves me a lot of debugging time. I can give it a try by including _fp8_data/_fp8_scale/_fp8_amax in __tensor_flatten__ and __tensor_unflatten__.

Contributor Author

torch.compile works after patching pytorch/pytorch#127431; will compare traces in a 2nd PR.
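
For illustration, a minimal sketch of what conveying these optional caches through the subclass protocol could look like, using the attribute and class names from this diff revision; the implementation that actually landed in later commits may differ:

```python
def __tensor_flatten__(self):
    inner_tensors = ["_tensor"]
    # Report only the caches that are currently populated, so that
    # __tensor_unflatten__ reconstructs exactly the same set.
    for name in ("_fp8_data", "_fp8_scale", "_fp8_amax"):
        if getattr(self, name) is not None:
            inner_tensors.append(name)
    return inner_tensors, {"mm_config": self._mm_config}

@staticmethod
def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride):
    out = WeightWithDynamicFloat8CastTensor(
        inner_tensors["_tensor"], metadata["mm_config"]
    )
    # Restore whichever caches were flattened above.
    for name in ("_fp8_data", "_fp8_scale", "_fp8_amax"):
        if name in inner_tensors:
            setattr(out, name, inner_tensors[name])
    return out
```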

self._fp8_scale: Optional[torch.Tensor] = None
self._fp8_amax: Optional[torch.Tensor] = None

@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs=None):
@@ -190,9 +196,22 @@ def __repr__(self):
return f"WeightWithDynamicFloat8CastTensor(tensor={self._tensor}, mm_config={self._mm_config})"

def fsdp_pre_all_gather(self, mesh):
Contributor Author
If _pre_computed_amax is set, we skip tensor_to_amax and go directly to amax_to_scale.

float8_tensor = cast_to_float8_e4m3fn(
self._tensor, self._mm_config, reduce_amax=True
)
if self._fp8_data is not None and self._fp8_scale is not None:
return (self._fp8_data,), (self._fp8_scale,)
if self._fp8_amax is not None:
scale = amax_to_scale(
self._fp8_amax,
torch.float8_e4m3fn,
self._fp8_amax.dtype,
clamp_amax=False,
)
float8_tensor = Float8Tensor.to_float8(
self._tensor, scale, torch.float8_e4m3fn, mm_config=self._mm_config
)
else:
float8_tensor = cast_to_float8_e4m3fn(
self._tensor, self._mm_config, reduce_amax=True
)
return (float8_tensor._data,), (float8_tensor._scale,)

def fsdp_post_all_gather(
89 changes: 87 additions & 2 deletions float8_experimental/float8_linear_utils.py
@@ -5,16 +5,25 @@
# LICENSE file in the root directory of this source tree.
import copy
import logging
import warnings
from enum import auto, Enum
from typing import Callable, List, Optional, Type

import torch
import torch.distributed as dist
import torch.nn as nn
from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
from float8_experimental.float8_dynamic_linear import (
Float8DynamicLinear,
WeightWithDynamicFloat8CastTensor,
)
from float8_experimental.float8_linear import Float8Linear

from float8_experimental.float8_utils import amax_history_to_scale_stack
from float8_experimental.float8_utils import (
amax_history_to_scale_stack,
E4M3_MAX_POS,
EPS,
to_fp8_saturated,
)
from torch.distributed._functional_collectives import all_reduce, AsyncCollectiveTensor

log = logging.getLogger(__name__)
@@ -322,3 +331,79 @@ def inner_func():
for child in fp8_layers:
# Set a flag to signal amaxes/scales are ready
child.amax_and_scale_synced = True


def precompute_float8_amax(module: nn.Module) -> None:
Contributor
can we put this in distributed_utils.py?

I think the function name should include that this is intended for FSDP2 with float8 all-gather

Contributor Author

moving to fsdp_utils.py according to PR #310

Contributor Author

indicating fsdp by renaming to precompute_float8_amax_for_fsdp

Contributor

@weifengpy do you plan / want to use compile on this, and are there any gaps around here that you think would be good to prioritize on the compile side?

This is mostly just me remembering @awgu mention a while ago that he thought compile added noticeable runtime overhead, and I can't remember if it was for this specific case. If it is, and we think compiling this code would be useful, I can prioritize looking into the runtime overhead.

Contributor Author

Hi @bdhirsh, I plan to polish and land this PR without compile next week to conclude H1; most importantly, add _pre_computed_amax to flatten/unflatten.

Reducing runtime overhead from torch.compile is still meaningful since we want torch.compile(fp8 casting) in FSDP2 pre-forward hooks. Would it be helpful if I work on a mini repro with profiler traces? I want to unblock you in the short term.

Contributor

If you have a mini repro showing bad runtime overheads with compile, that would be great!

Contributor Author

Hi @bdhirsh, I have created a repro: pytorch/pytorch#129457. I highlighted the extra CPU overhead and GPU time for torch.compile(mode="reduce-overhead").
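
The linked repro is not reproduced here, but as a rough illustration of the kind of measurement being discussed, a single-GPU micro-benchmark comparing the eager vs. compiled foreach-style amax computation might look like the following (shapes, iteration counts, and the timing harness are arbitrary choices, not taken from the PR):

```python
import torch

def compute_amaxes(weights):
    abs_weights = torch._foreach_abs(weights)
    return torch.vstack([torch.max(a) for a in abs_weights])

weights = [torch.randn(4096, 4096, device="cuda") for _ in range(8)]
compiled = torch.compile(compute_amaxes)

for name, fn in (("eager", compute_amaxes), ("compiled", compiled)):
    for _ in range(3):  # warmup (includes compilation for the compiled variant)
        fn(weights)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(20):
        fn(weights)
    end.record()
    torch.cuda.synchronize()
    print(f"{name}: {start.elapsed_time(end) / 20:.3f} ms/iter")
```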

from torch.distributed._tensor import DTensor

if any(isinstance(m, Float8Linear) for m in module.modules()):
raise NotImplementedError("Only supports Float8DynamicLinear, not Float8Linear")
float8_linears: List[Float8DynamicLinear] = [
m
for m in module.modules()
if isinstance(m, Float8DynamicLinear)
and isinstance(m.weight, DTensor)
and isinstance(m.weight._local_tensor, WeightWithDynamicFloat8CastTensor)
]
weights: List[DTensor] = [float8_linear.weight for float8_linear in float8_linears]

def compute_amaxes(weights: List[DTensor]):
abs_weights = torch._foreach_abs(weights) # S0 (the S0/P/R comments denote DTensor placements: Shard(0), Partial, Replicate)
amax_tensor = torch.vstack([torch.max(a) for a in abs_weights]) # P
amax_tensor = torch.clamp(amax_tensor, EPS) # R
Contributor Author (weifengpy, Jun 6, 2024)

torch.clamp calls all_reduce. I avoided calling it again in amax_to_scale(clamp_amax=False)


So you are relying on torch.clamp to run the all-reduce implicitly from changing sharding from partial to replicate?

If this fragments the code, could we just all-reduce the amax tensor and then leave the clamp to amax_to_scale? I agree the current way is faster since we are doing one clamp for all amaxes, but in case float8 folks are not happy with this fragmentation, this seems like another way.

Contributor Author

thanks for the suggestions. I can collect feedback from float8 folks if they have a preference

Contributor

Can we just add a comment explaining what is going on? I think it's fine as long as the code is easy to understand and there is no magic.

Contributor

agreed

amaxes = torch.split(amax_tensor, 1) # R
return amaxes

if weights:
# amaxes = compute_amaxes(weights)
# amaxes = torch.compile(compute_amaxes, mode="reduce-overhead")(weights)
amaxes = torch.compile(compute_amaxes)(weights)
for amax, float8_linear in zip(amaxes, float8_linears):
float8_linear.weight._local_tensor._fp8_amax = amax._local_tensor
else:
warnings.warn(
"Calling precompute_float8_weights without any weights using FSDP fp8 all-gather!"
)
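
For reference, a minimal sketch of the alternative suggested in the review thread above: make the Partial-to-Replicate all-reduce explicit via DTensor.redistribute and leave the EPS clamp to amax_to_scale (with its default clamp_amax=True). This assumes the same DTensor weights as in this diff and is not what the PR landed:

```python
from typing import List

import torch
from torch.distributed._tensor import DTensor, Replicate


def compute_amaxes_explicit(weights: List[DTensor]):
    abs_weights = torch._foreach_abs(weights)  # Shard(0)
    amax_tensor = torch.vstack([torch.max(a) for a in abs_weights])  # Partial
    # Run the all-reduce explicitly instead of relying on torch.clamp to
    # trigger it; amax_to_scale then applies the EPS clamp per weight.
    amax_tensor = amax_tensor.redistribute(placements=[Replicate()])  # assumes a 1-D mesh
    return torch.split(amax_tensor, 1)
```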


def precompute_float8_weights(module: nn.Module) -> None:
from torch.distributed._tensor import DTensor

if any(isinstance(m, Float8Linear) for m in module.modules()):
raise NotImplementedError("Only supports Float8DynamicLinear, not Float8Linear")
float8_linears: List[Float8DynamicLinear] = [
m
for m in module.modules()
if isinstance(m, Float8DynamicLinear)
and isinstance(m.weight, DTensor)
and isinstance(m.weight._local_tensor, WeightWithDynamicFloat8CastTensor)
]
weights: List[DTensor] = [float8_linear.weight for float8_linear in float8_linears]

def compute_weights_and_scales(weights: List[DTensor]):
abs_weights = torch._foreach_abs(weights) # S0
# abs_weights = [torch.abs(w) for w in weights]
amax_tensor = torch.vstack([torch.max(a) for a in abs_weights]) # P
amax_tensor = torch.clamp(amax_tensor, EPS) # R
scales_tensor = E4M3_MAX_POS / amax_tensor # R
scales = torch.split(scales_tensor, 1) # R
weights_scaled = torch._foreach_mul(weights, scales) # S0
datas = [to_fp8_saturated(w, torch.float8_e4m3fn) for w in weights_scaled] # S0
# torch._foreach_clamp_min_(weights_scaled, -1 * E4M3_MAX_POS)
# torch._foreach_clamp_max_(weights_scaled, E4M3_MAX_POS)
# datas = [w.to(torch.float8_e4m3fn) for w in weights_scaled]
return datas, scales

if weights:
# datas, scales = compute_weights_and_scales(weights)
datas, scales = torch.compile(compute_weights_and_scales)(weights)
# datas, scales = torch.compile(compute_weights_and_scales, mode="reduce-overhead")(weights)
for data, scale, float8_linear in zip(datas, scales, float8_linears):
float8_linear.weight._local_tensor._fp8_data = data._local_tensor
float8_linear.weight._local_tensor._fp8_scale = (
scale._local_tensor.squeeze()
)
else:
warnings.warn(
"Calling precompute_float8_weights without any weights using FSDP fp8 all-gather!"
)
12 changes: 9 additions & 3 deletions float8_experimental/float8_utils.py
@@ -24,12 +24,18 @@


@torch.no_grad()
def amax_to_scale(amax, float8_dtype, orig_dtype):
def amax_to_scale(amax, float8_dtype, orig_dtype, clamp_amax=True):
scale = torch.empty_like(amax, dtype=torch.float32)
if float8_dtype == torch.float8_e4m3fn:
res = E4M3_MAX_POS / torch.clamp(amax, min=EPS)
if clamp_amax:
Contributor
nit: I think if you have this on a separate line,
amax = clamp(amax, eps) if clamp_amax else amax
it makes the logic a little easier to follow.

res = E4M3_MAX_POS / torch.clamp(amax, min=EPS)
else:
res = E4M3_MAX_POS / amax
else: # e5m2
res = E5M2_MAX_POS / torch.clamp(amax, min=EPS)
if clamp_amax:
res = E5M2_MAX_POS / torch.clamp(amax, min=EPS)
else:
res = E5M2_MAX_POS / amax

# Ensure that the scale is representable in float16,
# this helps when amax is small. We are assuming that we don't need
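
Applying the reviewer's nit above, amax_to_scale could hoist the clamp onto its own line so each dtype branch is a single division. A sketch using the diff's constants (E4M3_MAX_POS, E5M2_MAX_POS, EPS); the float16 tail is reconstructed from the comment above and FP16_MAX_POS is an assumed module-level constant, so the merged function may differ:

```python
@torch.no_grad()
def amax_to_scale(amax, float8_dtype, orig_dtype, clamp_amax=True):
    scale = torch.empty_like(amax, dtype=torch.float32)
    # Hoist the optional clamp so each dtype branch is a single division.
    amax = torch.clamp(amax, min=EPS) if clamp_amax else amax
    if float8_dtype == torch.float8_e4m3fn:
        res = E4M3_MAX_POS / amax
    else:  # e5m2
        res = E5M2_MAX_POS / amax
    # Keep the scale representable in float16, which helps when amax is small.
    if orig_dtype is torch.float16:
        res = torch.clamp(res, max=FP16_MAX_POS)  # FP16_MAX_POS assumed defined alongside EPS
    scale.copy_(res)
    return scale
```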
83 changes: 70 additions & 13 deletions test/test_fsdp2/test_fsdp2_eager.py
@@ -1,7 +1,7 @@
import copy
import threading
import unittest
from typing import Any, List
from typing import Any, List, Union

import torch
import torch._dynamo.testing
@@ -11,7 +11,11 @@
Float8DynamicLinear,
WeightWithDynamicFloat8CastTensor,
)
from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear
from float8_experimental.float8_linear_utils import (
precompute_float8_amax,
precompute_float8_weights,
swap_linear_with_float8_linear,
)
from test_fsdp2_common import (
check_parity_bf16_mp,
check_parity_no_mp,
@@ -57,12 +61,13 @@ def init_multi_module(self) -> nn.Module:
def init_transformer(self, weight_tying: bool) -> nn.Module:
torch.manual_seed(42)
args = ModelArgs(
n_layers=3,
dim=768,
n_heads=12,
n_layers=8,
dim=4096,
n_heads=32,
dropout_p=0.0,
weight_tying=weight_tying,
vocab_size=32,
vocab_size=4096,
max_seq_len=4096,
)
module = Transformer(args).cuda()
self.broadcast_module(module)
@@ -78,17 +83,55 @@ def swap_linear_with_dynamic(self, module: nn.Module, **kwargs: Any) -> nn.Module:
return swap_linear_with_float8_linear(module, Float8DynamicLinear, **kwargs)


def profiler(output_dir):
"""
Utility that wraps `torch.profiler` to profile the model's operators.
See https://pytorch.org/docs/stable/profiler.html for more details.
The schedule for this profiler is: skip 1 step, wait 0 steps, warm up 1 step, trace 2 steps, repeat once.
Note: enabling the PyTorch profiler may reduce training speed.

Args:
output_dir (str): Output path for the exported Chrome trace.

Returns:
ContextManager: PyTorch profiler context manager
"""

def trace_handler(prof) -> None:
prof.export_chrome_trace(output_dir)

return torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
wait=0, warmup=1, active=2, repeat=1, skip_first=1
),
on_trace_ready=trace_handler,
record_shapes=True,
profile_memory=False,
with_stack=False,
)


class TestFloat8MultiProcess(FSDPTest, TestFloat8Common):
@property
def world_size(self) -> int:
return min(torch.cuda.device_count(), 2)

@skip_if_lt_x_gpu(2)
def test_transformer_parity_dynamic(self):
for enable_fsdp_fp8_all_gather in [False, True]:
self._test_transformer_parity_dynamic(enable_fsdp_fp8_all_gather)
for enable_fsdp_fp8_all_gather in [True]:
for pre_compute in [None, "cast", "amax"]:
self._test_transformer_parity_dynamic(
enable_fsdp_fp8_all_gather, pre_compute
)

def _test_transformer_parity_dynamic(self, enable_fsdp_fp8_all_gather: bool):
def _test_transformer_parity_dynamic(
self, enable_fsdp_fp8_all_gather: bool, pre_compute: Union[str, None]
):
# NOTE: Weight-tying does not compose with fp8 all-gather because the
# embedding weight and output linear weight are tied but only the
# latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to
Expand All @@ -106,11 +149,25 @@ def _test_transformer_parity_dynamic(self, enable_fsdp_fp8_all_gather: bool):
ref_optim = torch.optim.Adam(ref_module.parameters(), lr=1e-2)
optim = torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True)
local_inp = torch.randint(
0, ref_module.tok_embeddings.weight.size(0), (16, 16), device="cuda"
)
check_parity_no_mp(
self, ref_module, ref_optim, module, optim, local_inp, Float8DynamicLinear
0, ref_module.tok_embeddings.weight.size(0), (4, 512), device="cuda"
)
with profiler(
output_dir=f"./test_fsdp2_eager_fp8_{enable_fsdp_fp8_all_gather}_{pre_compute}_rank_{torch.distributed.get_rank()}.json"
) as prof:
for i in range(5):
optim.zero_grad()
loss = module(local_inp).sum()
# if torch.distributed.get_rank() == 0:
# print(f"{pre_compute=} {i=} {loss=}")
loss.backward()
optim.step()
if pre_compute is None:
pass
elif pre_compute == "cast":
precompute_float8_weights(module)
elif pre_compute == "amax":
precompute_float8_amax(module)
prof.step()

@skip_if_lt_x_gpu(2)
def test_transformer_memory(self):