@@ -185,102 +185,110 @@ def sync_float8_amax_and_scale_history(model: torch.nn.Module, fp8_layers=None)
         )
         return
 
-    # Loop over all fp8 layers and grab the needed tensors
-    fp8_amax_x_tensor_list = [None] * len(fp8_layers)
-    fp8_amax_w_tensor_list = [None] * len(fp8_layers)
-    fp8_amax_dL_dY_tensor_list = [None] * len(fp8_layers)
-
-    fp8_x_amax_history_stack = [None] * len(fp8_layers)
-    fp8_w_amax_history_stack = [None] * len(fp8_layers)
-    fp8_dL_dY_amax_history_stack = [None] * len(fp8_layers)
-
-    x_dtypes = set()
-    scale_fn_recipes = set()
-
-    for idx, child in enumerate(fp8_layers):
-        fp8_amax_x_tensor_list[idx] = child.fp8_amax_x
-        fp8_amax_w_tensor_list[idx] = child.fp8_amax_w
-        fp8_amax_dL_dY_tensor_list[idx] = child.fp8_amax_dL_dY
-
-        fp8_x_amax_history_stack[idx] = child.fp8_amax_history_x
-        fp8_w_amax_history_stack[idx] = child.fp8_amax_history_w
-        fp8_dL_dY_amax_history_stack[idx] = child.fp8_amax_history_dL_dY
-
-        x_dtypes.add(child.last_seen_input_dtype)
-        scale_fn_recipes.add(child.recipe.scale_fn_name)
-
-    # TODO This way to get the activation dtype is not ideal
-    if len(x_dtypes) != 1:
-        raise ValueError(
-            f"All layers must have the same last seen input_dtype, got {x_dtypes}"
-        )
-    x_dtype = next(iter(x_dtypes))
+    def inner_func():
+        # Loop over all fp8 layers and grab the needed tensors
+        fp8_amax_x_tensor_list = [None] * len(fp8_layers)
+        fp8_amax_w_tensor_list = [None] * len(fp8_layers)
+        fp8_amax_dL_dY_tensor_list = [None] * len(fp8_layers)
 
-    if len(scale_fn_recipes) != 1:
-        raise ValueError(
-            f"All layers must have the same scale_fn recipe, got {scale_fn_recipes}"
-        )
-    scale_fn_recipe = next(iter(scale_fn_recipes))
+        fp8_x_amax_history_stack = [None] * len(fp8_layers)
+        fp8_w_amax_history_stack = [None] * len(fp8_layers)
+        fp8_dL_dY_amax_history_stack = [None] * len(fp8_layers)
 
-    assert (
-        len(fp8_amax_x_tensor_list)
-        == len(fp8_amax_w_tensor_list)
-        == len(fp8_amax_dL_dY_tensor_list)
-    ), "Mismatched lengths of amax tensors."
-
-    if dist.is_initialized():
-        # Combine all the amax tensors into one tensor and reduce it
-        all_amax_tensors = torch.cat(
-            fp8_amax_x_tensor_list + fp8_amax_w_tensor_list + fp8_amax_dL_dY_tensor_list
+        x_dtypes = set()
+        scale_fn_recipes = set()
+
+        for idx, child in enumerate(fp8_layers):
+            fp8_amax_x_tensor_list[idx] = child.fp8_amax_x
+            fp8_amax_w_tensor_list[idx] = child.fp8_amax_w
+            fp8_amax_dL_dY_tensor_list[idx] = child.fp8_amax_dL_dY
+
+            fp8_x_amax_history_stack[idx] = child.fp8_amax_history_x
+            fp8_w_amax_history_stack[idx] = child.fp8_amax_history_w
+            fp8_dL_dY_amax_history_stack[idx] = child.fp8_amax_history_dL_dY
+
+            x_dtypes.add(child.last_seen_input_dtype)
+            scale_fn_recipes.add(child.recipe.scale_fn_name)
+
+        # TODO This way to get the activation dtype is not ideal
+        if len(x_dtypes) != 1:
+            raise ValueError(
+                f"All layers must have the same last seen input_dtype, got {x_dtypes}"
+            )
+        x_dtype = next(iter(x_dtypes))
+
+        if len(scale_fn_recipes) != 1:
+            raise ValueError(
+                f"All layers must have the same scale_fn recipe, got {scale_fn_recipes}"
+            )
+        scale_fn_recipe = next(iter(scale_fn_recipes))
+
+        assert (
+            len(fp8_amax_x_tensor_list)
+            == len(fp8_amax_w_tensor_list)
+            == len(fp8_amax_dL_dY_tensor_list)
+        ), "Mismatched lengths of amax tensors."
+
+        if dist.is_initialized():
+            # Combine all the amax tensors into one tensor and reduce it
+            all_amax_tensors = torch.cat(
+                fp8_amax_x_tensor_list
+                + fp8_amax_w_tensor_list
+                + fp8_amax_dL_dY_tensor_list
+            )
+            all_reduced_amax_tensor = all_reduce(
+                all_amax_tensors, "MAX", list(range(dist.get_world_size()))
+            )
+            if isinstance(all_reduced_amax_tensor, AsyncCollectiveTensor):
+                all_reduced_amax_tensor = all_reduced_amax_tensor.wait()
+
+            (
+                reduced_fp8_amax_tensor,
+                reduced_fp8_amax_w_tensor,
+                reduced_fp8_amax_dL_dY_tensor,
+            ) = torch.split(all_reduced_amax_tensor, len(fp8_amax_x_tensor_list))
+
+            for idx, child in enumerate(fp8_layers):
+                child.fp8_amax_x.copy_(reduced_fp8_amax_tensor[idx])
+                child.fp8_amax_w.copy_(reduced_fp8_amax_w_tensor[idx])
+                child.fp8_amax_dL_dY.copy_(reduced_fp8_amax_dL_dY_tensor[idx])
+
+        # We create two stacked tensor groups, one for the amax history and one for the current scales
+        fp8_amax_x_tensors = torch.vstack(fp8_amax_x_tensor_list)
+        fp8_amax_w_tensors = torch.vstack(fp8_amax_w_tensor_list)
+        fp8_amax_dL_dY_tensors = torch.vstack(fp8_amax_dL_dY_tensor_list)
+
+        fp8_x_amax_history_stack = torch.vstack(fp8_x_amax_history_stack)
+        fp8_w_amax_history_stack = torch.vstack(fp8_w_amax_history_stack)
+        fp8_dL_dY_amax_history_stack = torch.vstack(fp8_dL_dY_amax_history_stack)
+
+        # Update the history stacks with the new amax values
+        _update_history_stack(fp8_amax_x_tensors, fp8_x_amax_history_stack)
+        _update_history_stack(fp8_amax_w_tensors, fp8_w_amax_history_stack)
+        _update_history_stack(fp8_amax_dL_dY_tensors, fp8_dL_dY_amax_history_stack)
+
+        # Calculate the new scales from the updated history stacks
+        new_x_scales = amax_history_to_scale_stack(
+            fp8_x_amax_history_stack, torch.float8_e4m3fn, x_dtype, scale_fn_recipe
         )
-        all_reduced_amax_tensor = all_reduce(
-            all_amax_tensors, "MAX", list(range(dist.get_world_size()))
+        new_w_scales = amax_history_to_scale_stack(
+            fp8_w_amax_history_stack, torch.float8_e4m3fn, x_dtype, scale_fn_recipe
+        )
+        new_dL_dY_scales = amax_history_to_scale_stack(
+            fp8_dL_dY_amax_history_stack, torch.float8_e5m2, x_dtype, scale_fn_recipe
         )
-        if isinstance(all_reduced_amax_tensor, AsyncCollectiveTensor):
-            all_reduced_amax_tensor = all_reduced_amax_tensor.wait()
-
-        (
-            reduced_fp8_amax_tensor,
-            reduced_fp8_amax_w_tensor,
-            reduced_fp8_amax_dL_dY_tensor,
-        ) = torch.split(all_reduced_amax_tensor, len(fp8_amax_x_tensor_list))
 
+        # Iterate through the layers and update the scales, and set the flag to signal that the amaxes/scales are ready
         for idx, child in enumerate(fp8_layers):
-            child.fp8_amax_x.copy_(reduced_fp8_amax_tensor[idx])
-            child.fp8_amax_w.copy_(reduced_fp8_amax_w_tensor[idx])
-            child.fp8_amax_dL_dY.copy_(reduced_fp8_amax_dL_dY_tensor[idx])
-
-    # We create two stacked tensor groups, one for the amax history and one for the current scales
-    fp8_amax_x_tensors = torch.vstack(fp8_amax_x_tensor_list)
-    fp8_amax_w_tensors = torch.vstack(fp8_amax_w_tensor_list)
-    fp8_amax_dL_dY_tensors = torch.vstack(fp8_amax_dL_dY_tensor_list)
-
-    fp8_x_amax_history_stack = torch.vstack(fp8_x_amax_history_stack)
-    fp8_w_amax_history_stack = torch.vstack(fp8_w_amax_history_stack)
-    fp8_dL_dY_amax_history_stack = torch.vstack(fp8_dL_dY_amax_history_stack)
-
-    # Update the history stacks with the new amax values
-    _update_history_stack(fp8_amax_x_tensors, fp8_x_amax_history_stack)
-    _update_history_stack(fp8_amax_w_tensors, fp8_w_amax_history_stack)
-    _update_history_stack(fp8_amax_dL_dY_tensors, fp8_dL_dY_amax_history_stack)
-
-    # Calculate the new scales from the updated history stacks
-    new_x_scales = amax_history_to_scale_stack(
-        fp8_x_amax_history_stack, torch.float8_e4m3fn, x_dtype, scale_fn_recipe
-    )
-    new_w_scales = amax_history_to_scale_stack(
-        fp8_w_amax_history_stack, torch.float8_e4m3fn, x_dtype, scale_fn_recipe
-    )
-    new_dL_dY_scales = amax_history_to_scale_stack(
-        fp8_dL_dY_amax_history_stack, torch.float8_e5m2, x_dtype, scale_fn_recipe
-    )
+            child.fp8_scale_x.copy_(new_x_scales[idx])
+            child.fp8_scale_w.copy_(new_w_scales[idx])
+            child.fp8_scale_dL_dY.copy_(new_dL_dY_scales[idx])
 
-    # Iterate through the layers and update the scales, and set the flag to signal that the amaxes/scales are ready
-    for idx, child in enumerate(fp8_layers):
-        child.fp8_scale_x.copy_(new_x_scales[idx])
-        child.fp8_scale_w.copy_(new_w_scales[idx])
-        child.fp8_scale_dL_dY.copy_(new_dL_dY_scales[idx])
+    # When cuda-graphs work we should compile with "reduce-overhead"
+    compiled_inner_func = torch.compile(inner_func)
+    compiled_inner_func()
 
+    for child in fp8_layers:
         # 4. set a flag to signal amaxes/scales are ready
         # We only update the flag if we know it will be checked by the modules
         if fp8_config.enable_amax_init:
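Nothing changes on the caller side: the compiled `inner_func` runs inside `sync_float8_amax_and_scale_history`, which is still invoked once per training iteration. Below is a minimal usage sketch; the import path, model setup, and training-loop details are assumptions for illustration and are not part of this hunk.

```python
import torch

# Assumed import path for illustration; this hunk only shows the function body.
from float8_experimental.float8_linear_utils import sync_float8_amax_and_scale_history


def train_step(model, optimizer, loss_fn, batch, target):
    optimizer.zero_grad()
    loss = loss_fn(model(batch), target)
    loss.backward()
    # Sync amax values / recompute scales for all fp8 layers before the
    # optimizer step; the heavy per-layer math is now torch.compile'd.
    sync_float8_amax_and_scale_history(model)
    optimizer.step()
```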