This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 7c7cbae

wanchaol authored and facebook-github-bot committed
add PrepareFloat8ModuleInput for sequence parallel (#275)
Summary:
When applying sequence parallel to a module whose input projection has multiple linear layers consuming the same activation, we often want to transform that activation from Shard to Replicate once (a single all-gather) and then reuse the all-gathered result. For fp8 we need to perform the cast before the Shard -> Replicate redistribute so that the all-gather itself runs in fp8. This PR subclasses PrepareModuleInput to add the fp8 casting logic, ensuring we run an fp8 all-gather instead of a bf16 all-gather followed by a cast for the computation. It also adjusts the test cases to exercise a realistic FFN under sequence parallel.

torchtitan perf benchmarks (8x H100 devgpu, Llama3 8B, 2-way DP, 4-way TP):

* eager (without fp8 all-gather): 3265 WPS
* eager (with fp8 all-gather, this PR): 3900 WPS
* compile (without fp8 all-gather): 5850 WPS
* compile (with fp8 all-gather): 6592 WPS, with 37% MFU on H100

So even in eager mode we get roughly a 20% perf improvement by running every all-gather in fp8, and the compiled fp8 all-gather throughput is more than double the eager bf16-all-gather baseline (102% more WPS). :)

Pull Request resolved: #275

Reviewed By: vkuzo

Differential Revision: D58346331

Pulled By: wanchaol

fbshipit-source-id: 008ca49b6aa6973d2f6d6165e13088d6571cabb4
1 parent 5fc07fc commit 7c7cbae
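
As a quick arithmetic check of the percentages quoted above (the variable names below are ours, not part of the commit):

# Recompute the quoted speedups from the raw WPS numbers in the summary.
eager_bf16_allgather = 3265
eager_fp8_allgather = 3900
compile_fp8_allgather = 6592

print(f"{eager_fp8_allgather / eager_bf16_allgather - 1:.0%}")    # ~19%, the "around 20%" eager gain
print(f"{compile_fp8_allgather / eager_bf16_allgather - 1:.0%}")  # ~102%, the "more than doubled" compiled gain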

2 files changed: +158, -14 lines

float8_experimental/float8_tensor_parallel.py

Lines changed: 96 additions & 1 deletion
@@ -1,11 +1,16 @@
+import torch
 import torch.nn as nn
 from float8_experimental.float8_dynamic_linear import (
     cast_to_float8_e4m3fn,
     cast_to_float8_e5m2_bw,
 )
 from torch.distributed._tensor import DTensor
 from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    PrepareModuleInput,
+    RowwiseParallel,
+)
 
 # subclass the ColwiseParallel and RowwiseParallel classes
 # to add the float8 support
@@ -109,3 +114,93 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
         )
 
         return super()._apply(module, device_mesh)
+
+
+class PrepareFloat8ModuleInput(PrepareModuleInput):
+    # subclass the PrepareModuleInput classes to implement fp8 specific logic, the only difference is that
+    # after we prepare the input DTensor, we cast the input to DTensor(Float8Tensor)
+    # This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate)
+    # so that if there are multiple float8 users of the input activation, we perform fp8 allgather
+    # only once.
+    # FP8 Args:
+    #   float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input,
+    #       we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn
+    #   fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used
+    #       for the float8 cast. If not specified, we will search for the Float8DynamicLinear in the submodules
+    #       and use the forward config from that module, in this case all module's forward config must be
+    #       the same.
+
+    def __init__(
+        self,
+        *,
+        input_layouts=None,
+        desired_input_layouts=None,
+        input_kwarg_layouts=None,
+        desired_input_kwarg_layouts=None,
+        use_local_output=False,
+        float8_dtype=torch.float8_e4m3fn,
+        fwd_config_submodule_fqn=None,
+    ):
+        super().__init__(
+            input_layouts=input_layouts,
+            desired_input_layouts=desired_input_layouts,
+            input_kwarg_layouts=input_kwarg_layouts,
+            desired_input_kwarg_layouts=desired_input_kwarg_layouts,
+            use_local_output=use_local_output,
+        )
+
+        # fp8 specific fields
+        self.float8_dtype = float8_dtype
+        self.fwd_config_submodule_fqn = fwd_config_submodule_fqn
+
+        if self.float8_dtype != torch.float8_e4m3fn:
+            raise NotImplementedError(
+                "PrepareFloat8ModuleInput only support casting to float8_e4m3fn for now"
+            )
+
+    def _prepare_input_arg(self, input, mesh, input_layout, desired_layout):
+        if input_layout is not None:
+            if isinstance(input, DTensor):
+                # TODO: re-enable the check once we fix the compile path
+                # assert inp.placements[0] == input_layout
+                dt_inp = input
+            else:
+                assert isinstance(
+                    input, torch.Tensor
+                ), "expecting input to be a torch.Tensor!"
+                dt_inp = DTensor.from_local(
+                    input, mesh, (input_layout,), run_check=False
+                )
+
+            dt_inp = cast_to_float8_e4m3fn(
+                dt_inp, self.fwd_linear_config
+            )  # DTensor(Float8Tensor)
+            if desired_layout is not None and input_layout != desired_layout:
+                dt_inp = dt_inp.redistribute(placements=(desired_layout,))
+
+            return dt_inp.to_local() if self.use_local_output else dt_inp
+        else:
+            return input
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
+
+        fwd_linear_config = None
+        if self.fwd_config_submodule_fqn is not None:
+            fwd_linear = module.get_submodule(self.fwd_config_submodule_fqn)
+            assert isinstance(fwd_linear, Float8DynamicLinear)
+            fwd_linear_config = fwd_linear.forward_config
+        else:
+            # search for ScaledMM configs for all the submodules and make sure they are the same
+            for mod in module.modules():
+                if isinstance(mod, Float8DynamicLinear):
+                    if fwd_linear_config is None:
+                        fwd_linear_config = mod.forward_config
+                    else:
+                        assert (
+                            fwd_linear_config == mod.forward_config
+                        ), "All the Float8DynamicLinear modules should have same forward config!"
+
+        self.fwd_linear_config = fwd_linear_config
+        super()._apply(module, device_mesh)
+        return module
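
For reference, a minimal sketch of how the new class could be wired into a sequence-parallel plan. The module names (ffn, ffn.w1, ffn.w2, ffn.out_proj), the layouts, and the fwd_config_submodule_fqn usage mirror the test added below; the helper name apply_fp8_sequence_parallel is ours, and the model is assumed to have already been converted with swap_linear_with_float8_linear:

from torch.distributed._tensor import Replicate, Shard
from torch.distributed.tensor.parallel import parallelize_module

from float8_experimental.float8_tensor_parallel import (
    Float8ColwiseParallel,
    Float8RowwiseParallel,
    PrepareFloat8ModuleInput,
)


def apply_fp8_sequence_parallel(model, mesh):
    # Cast the ffn input to float8 *before* the Shard(1) -> Replicate() redistribute,
    # so the all-gather runs in fp8 and its result is shared by w1 and w2.
    return parallelize_module(
        model,
        mesh,
        {
            "ffn": PrepareFloat8ModuleInput(
                input_layouts=Shard(1),
                desired_input_layouts=Replicate(),
                fwd_config_submodule_fqn="w2",  # optional; omit to auto-discover the forward config
            ),
            "ffn.w1": Float8ColwiseParallel(),
            "ffn.w2": Float8ColwiseParallel(),
            "ffn.out_proj": Float8RowwiseParallel(
                output_layouts=Shard(1), use_local_output=False
            ),
        },
    )

When fwd_config_submodule_fqn is omitted, _apply searches the submodules for Float8DynamicLinear instances and asserts that they all share the same forward config.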

test/test_dtensor.py

Lines changed: 62 additions & 13 deletions
@@ -12,6 +12,7 @@
 
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from float8_experimental.float8_dynamic_linear import (
     Float8DynamicLinear,
@@ -22,6 +23,7 @@
 from float8_experimental.float8_tensor_parallel import (
     Float8ColwiseParallel,
     Float8RowwiseParallel,
+    PrepareFloat8ModuleInput,
 )
 from float8_experimental.float8_utils import tensor_to_scale
 from torch.distributed._tensor import distribute_tensor, DTensor, Replicate, Shard
@@ -38,17 +40,26 @@ def setup_distributed():
     return device_mesh
 
 
-class ToyModel(nn.Module):
+class FeedForward(nn.Module):
     """MLP based model"""
 
+    def __init__(self):
+        super(FeedForward, self).__init__()
+        self.w1 = nn.Linear(16, 32, bias=False)
+        self.w2 = nn.Linear(16, 32, bias=False)
+        self.out_proj = nn.Linear(32, 16, bias=False)
+
+    def forward(self, x):
+        return self.out_proj(F.silu(self.w1(x)) * self.w2(x))
+
+
+class ToyModel(nn.Module):
     def __init__(self):
         super(ToyModel, self).__init__()
-        self.in_proj = nn.Linear(16, 32)
-        self.relu = nn.ReLU()
-        self.out_proj = nn.Linear(32, 16)
+        self.ffn = FeedForward()
 
     def forward(self, x):
-        return self.out_proj(self.relu(self.in_proj(x)))
+        return self.ffn(x)
 
 
 def test_scaled_mm(mesh: DeviceMesh, size=16):
@@ -182,8 +193,9 @@ def test_fp8_mlp_tensor_parallelism_base(
         tp_model,
         mesh,
         {
-            "in_proj": Float8ColwiseParallel(),
-            "out_proj": Float8RowwiseParallel(),
+            "ffn.w1": Float8ColwiseParallel(),
+            "ffn.w2": Float8ColwiseParallel(),
+            "ffn.out_proj": Float8RowwiseParallel(),
         },
     )
 
@@ -192,17 +204,46 @@ def test_fp8_mlp_tensor_parallelism_base(
         sp_model,
         mesh,
         {
-            "in_proj": Float8ColwiseParallel(input_layouts=Shard(0)),
-            "out_proj": Float8RowwiseParallel(
-                output_layouts=Shard(0), use_local_output=False
+            "ffn": PrepareFloat8ModuleInput(
+                input_layouts=Shard(1), desired_input_layouts=Replicate()
+            ),
+            "ffn.w1": Float8ColwiseParallel(),
+            "ffn.w2": Float8ColwiseParallel(),
+            "ffn.out_proj": Float8RowwiseParallel(
+                output_layouts=Shard(1), use_local_output=False
+            ),
+        },
+    )
+
+    # PrepareFloat8ModuleInput with specific submodule fqn
+    sp_model2 = copy.deepcopy(toy_model)
+    sp_model2 = swap_linear_with_float8_linear(
+        sp_model2, Float8DynamicLinear, emulate=True
+    )
+
+    sp_model2 = parallelize_module(
+        sp_model2,
+        mesh,
+        {
+            "ffn": PrepareFloat8ModuleInput(
+                input_layouts=Shard(1),
+                desired_input_layouts=Replicate(),
+                fwd_config_submodule_fqn="w2",
+            ),
+            "ffn.w1": Float8ColwiseParallel(),
+            "ffn.w2": Float8ColwiseParallel(),
+            "ffn.out_proj": Float8RowwiseParallel(
+                output_layouts=Shard(1), use_local_output=False
             ),
         },
     )
 
     if compile:
         tp_model = torch.compile(tp_model)
+        sp_model = torch.compile(sp_model)
+        sp_model2 = torch.compile(sp_model2)
 
-    x_fp32 = torch.rand(size * 2, size, device=device, requires_grad=False)
+    x_fp32 = torch.rand(size, size * 2, size, device=device, requires_grad=False)
     x_fp32_tp_input = x_fp32.clone()
     x_fp32_sp_input = distribute_tensor(x_fp32.clone(), mesh, [Shard(0)])
 
@@ -214,11 +255,19 @@ def test_fp8_mlp_tensor_parallelism_base(
     global_out.sum().backward()
     torch.testing.assert_close(tp_out, global_out)
     torch.testing.assert_close(sp_out.full_tensor(), global_out)
+    torch.testing.assert_close(tp_model.ffn.w1.weight.grad, sp_model.ffn.w1.weight.grad)
+    torch.testing.assert_close(
+        tp_model.ffn.out_proj.weight.grad, sp_model.ffn.out_proj.weight.grad
+    )
+
+    sp_out2 = sp_model2(x_fp32_sp_input)
+    sp_out2.sum().backward()
+    torch.testing.assert_close(sp_out2.full_tensor(), global_out)
     torch.testing.assert_close(
-        tp_model.in_proj.weight.grad, sp_model.in_proj.weight.grad
+        tp_model.ffn.w1.weight.grad, sp_model2.ffn.w1.weight.grad
     )
     torch.testing.assert_close(
-        tp_model.out_proj.weight.grad, sp_model.out_proj.weight.grad
+        tp_model.ffn.out_proj.weight.grad, sp_model2.ffn.out_proj.weight.grad
     )
 
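
As a single-process illustration (no parallelism or float8 involved) of why this FFN benefits from a shared, prepared input: both w1 and w2 consume the same activation, so redistributing and casting it once covers both matmuls. A minimal sketch using the same toy dimensions as the test above:

import torch
import torch.nn as nn
import torch.nn.functional as F


class FeedForward(nn.Module):
    """Same toy FFN as in the test above: w1 and w2 share the input activation."""

    def __init__(self):
        super().__init__()
        self.w1 = nn.Linear(16, 32, bias=False)
        self.w2 = nn.Linear(16, 32, bias=False)
        self.out_proj = nn.Linear(32, 16, bias=False)

    def forward(self, x):
        return self.out_proj(F.silu(self.w1(x)) * self.w2(x))


x = torch.rand(16, 32, 16)      # same 3-D input shape the test feeds in
print(FeedForward()(x).shape)   # torch.Size([16, 32, 16])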
