This repository was archived by the owner on Aug 7, 2024. It is now read-only.

Commit 48bd393

[wip] static scaling support for training
Summary:

In certain cases, activations and gradients can have a bounded range. For example, consider sigmoid -> fc -> ln -> sigmoid:

1. The range of sigmoid in the forward pass is bounded, so we can scale statically if we are OK with a slight accuracy drop in the case that the observed values do not reach the theoretical bound.
2. The range of the derivative of sigmoid is bounded (https://math.stackexchange.com/questions/78575/derivative-of-sigmoid-function-sigma-x-frac11e-x).
3. The derivative of LN (https://liorsinai.github.io/mathematics/2022/05/18/layernorm.html) depends on the incoming gradient and the trainable LN parameters, so we can derive a bound from the incoming bound and the max of the LN parameters.

This PR adds static scaling as an option for x, w, and dL_dY, plus a quick benchmark to verify that performance is as we expect. TODO: add numerics testing.

Test Plan:

```
// baseline
python benchmarks/profile_linear_float8.py ~/local/tmp/test --model_type sigmoid_linear_ln_sigmoid
...
experiment     0_ref  1_float8  f8_div_ref  ref_div_f8
category
0_gemm         0.160     0.098       0.613       1.632
1_f8_overhead  0.000     0.100         inf       0.000
2_other        0.147     0.121       0.823       1.215
All            0.307     0.319       1.040       0.962

// static scaling for x (easier to justify numerics given a bounded activation such as sigmoid)
python benchmarks/profile_linear_float8.py ~/local/tmp/test --model_type sigmoid_linear_ln_sigmoid --scaling_type_x static
experiment     0_ref  1_float8  f8_div_ref  ref_div_f8
category
0_gemm         0.665     0.362       0.545       1.834
1_f8_overhead  0.000     0.269         inf       0.000
2_other        0.396     0.273       0.689       1.452
All            1.061     0.904       0.853       1.173

// static scaling for x and dL_dY (handwaving for now, the actual code would
// need to read the LN params to get the max)
python benchmarks/profile_linear_float8.py ~/local/tmp/test --model_type sigmoid_linear_ln_sigmoid --scaling_type_x static --scaling_type_dL_dY static
...
experiment     0_ref  1_float8  f8_div_ref  ref_div_f8
category
0_gemm         0.665     0.365       0.549       1.822
1_f8_overhead  0.000     0.242         inf       0.000
2_other        0.395     0.273       0.690       1.448
All            1.060     0.879       0.830       1.205
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 538c24e
Pull Request resolved: #306
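
As a companion to the Summary above (not part of this commit): a minimal sketch of how a static scale could be derived from a theoretical bound. The helper name `static_scale_from_bound`, the choice of `torch.float8_e4m3fn`, and the `scale = float8_max / amax` convention are assumptions for illustration only.

```
import torch

def static_scale_from_bound(
    theoretical_amax: float, float8_dtype=torch.float8_e4m3fn
) -> torch.Tensor:
    # map the theoretical max-abs value onto the largest representable
    # float8 value; torch.finfo(torch.float8_e4m3fn).max is 448.0
    float8_max = torch.finfo(float8_dtype).max
    return torch.tensor(float8_max / theoretical_amax)

# forward: sigmoid output lies in (0, 1), so a bound of 1.0 covers x
static_scale_x = static_scale_from_bound(1.0)

# backward: the derivative of sigmoid is bounded by 0.25; per the Summary,
# the dL_dY bound would additionally fold in the incoming gradient bound
# and the max of the trainable LayerNorm parameters
static_scale_sigmoid_grad = static_scale_from_bound(0.25)

print(static_scale_x, static_scale_sigmoid_grad)
```

Because the bound is known ahead of time, the scale is a constant and does not need to be recomputed from observed amax values during training.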
1 parent 8e9623a commit 48bd393

3 files changed: 230 additions & 87 deletions


benchmarks/profile_linear_float8.py

Lines changed: 118 additions & 76 deletions
```
@@ -149,6 +149,22 @@ def forward(self, h):
         return x
 
 
+class SigmoidLinearLNSigmoid(nn.Module):
+    def __init__(self, d1, d2):
+        super().__init__()
+        self.sigmoid1 = nn.Sigmoid()
+        self.fc = nn.Linear(d1, d2)
+        self.ln = nn.LayerNorm(d2)
+        self.sigmoid2 = nn.Sigmoid()
+
+    def forward(self, x):
+        x = self.sigmoid1(x)
+        x = self.fc(x)
+        x = self.ln(x)
+        x = self.sigmoid2(x)
+        return x
+
+
 @dataclass
 class ProfileConfig:
     file_path: Optional[str] = None
@@ -210,7 +226,12 @@ def main(
     model_type: str = "linear",
     dtype_filter: str = "both",
 ):
-    assert model_type in ("linear", "ln_linear", "norm_ffn_norm"), "unsupported"
+    assert model_type in (
+        "linear",
+        "ln_linear",
+        "norm_ffn_norm",
+        "sigmoid_linear_ln_sigmoid",
+    ), "unsupported"
     assert dtype_filter in ("both", "float8", "bfloat16")
 
     scaling_type_x = TensorScalingType(scaling_type_x)
@@ -242,6 +263,12 @@ def main(
         input_tensor = torch.randn(
             1, 8192, 4096, device=device, dtype=ref_dtype
         ).requires_grad_()
+    elif model_type == "sigmoid_linear_ln_sigmoid":
+        bsz, d1, d2 = 4096, 4096, 4096
+        m_ref = SigmoidLinearLNSigmoid(d1, d2)
+        input_tensor = torch.randn(
+            bsz, d1, device=device, dtype=ref_dtype, requires_grad=True
+        )
     else:
         M, K, N = 4 * 4096, 8192, 7168
         m_ref = torch.nn.Sequential(
@@ -258,6 +285,15 @@ def main(
         "scaling_type_w": scaling_type_w,
         "scaling_type_dL_dY": scaling_type_dL_dY,
     }
+    if scaling_type_x is TensorScalingType.STATIC:
+        # for now, dummy scale
+        extra_kwargs["static_scale_x"] = 1.0
+    if scaling_type_w is TensorScalingType.STATIC:
+        # for now, dummy scale
+        extra_kwargs["static_scale_w"] = 1.0
+    if scaling_type_dL_dY is TensorScalingType.STATIC:
+        # for now, dummy scale
+        extra_kwargs["static_scale_dL_dY"] = 1.0
 
     m_float8 = copy.deepcopy(m_ref)
     swap_linear_with_float8_linear(m_float8, **extra_kwargs)
@@ -300,85 +336,91 @@ def float8_forw_backward_wrapper(x):
     # if the `TORCHINDUCTOR_PROFILE` env var is enabled, parse its output
     # to populate triton kernel bandwidth further down in the script
     f = io.StringIO()
-    with redirect_stdout(f):
-        # warm up
-        for _ in range(1):
+    try:
+        with redirect_stdout(f):
+            # warm up
+            for _ in range(1):
+                if dtype_filter != "float8":
+                    ref_forw_backward(input_tensor)
+                if dtype_filter != "bfloat16":
+                    float8_forw_backward_wrapper(input_tensor)
+
+            profile_iters = 5
+            ref_times, float8_times = None, None
+            data = []
+
             if dtype_filter != "float8":
-                ref_forw_backward(input_tensor)
-            if dtype_filter != "bfloat16":
-                float8_forw_backward_wrapper(input_tensor)
-
-        profile_iters = 5
-        ref_times, float8_times = None, None
-        data = []
-
-        if dtype_filter != "float8":
-            # Profile Reference Model
-            print("profiling ref")
-            ref_suffix = f"_{model_type}_ref_compile_{compile}.json"
-            ref_path = profile_path_prefix + ref_suffix
-            profile_config = ProfileConfig(
-                ref_path, ref_suffix, iters=profile_iters, warmup_iters=2, sync=True
-            )
-            p = profile_function(profile_config, ref_forw_backward, input_tensor)
-            print(f"saved {ref_path}")
-            ref_times = profiler_output_to_time_by_kernel_name(p)
-            total_time_ms = sum(v for v in ref_times.values()) / 1e3 / profile_iters
-            for k, v in ref_times.items():
-                v_ms = v / 1e3 / profile_iters
-                data.append(
-                    [
-                        "0_ref",
-                        k,
-                        kernel_name_to_category(k),
-                        v_ms,
-                        v_ms / total_time_ms,
-                        None,
-                    ]
+                # Profile Reference Model
+                print("profiling ref")
+                ref_suffix = f"_{model_type}_ref_compile_{compile}.json"
+                ref_path = profile_path_prefix + ref_suffix
+                profile_config = ProfileConfig(
+                    ref_path, ref_suffix, iters=profile_iters, warmup_iters=2, sync=True
                 )
+                p = profile_function(profile_config, ref_forw_backward, input_tensor)
+                print(f"saved {ref_path}")
+                ref_times = profiler_output_to_time_by_kernel_name(p)
+                total_time_ms = sum(v for v in ref_times.values()) / 1e3 / profile_iters
+                for k, v in ref_times.items():
+                    v_ms = v / 1e3 / profile_iters
+                    data.append(
+                        [
+                            "0_ref",
+                            k,
+                            kernel_name_to_category(k),
+                            v_ms,
+                            v_ms / total_time_ms,
+                            None,
+                        ]
+                    )
 
-        if dtype_filter != "bfloat16":
-            # Profile Float8 Model
-            print("profiling float8")
-            float8_suffix = (
-                f"_{model_type}_float8_compile_{compile}_{scaling_repr}.json"
-            )
-            float8_path = profile_path_prefix + float8_suffix
-            profile_config = ProfileConfig(
-                float8_path,
-                float8_suffix,
-                iters=profile_iters,
-                warmup_iters=2,
-                sync=True,
-            )
-            p = profile_function(
-                profile_config, float8_forw_backward_wrapper, input_tensor
-            )
-            print(f"saved {float8_path}")
-            float8_times = profiler_output_to_time_by_kernel_name(p)
-            total_time_ms = sum(v for v in float8_times.values()) / 1e3 / profile_iters
-            for k, v in float8_times.items():
-                v_ms = v / 1e3 / profile_iters
-                data.append(
-                    [
-                        "1_float8",
-                        k,
-                        kernel_name_to_category(k),
-                        v / 1e3 / profile_iters,
-                        v_ms / total_time_ms,
-                        None,
-                    ]
+            if dtype_filter != "bfloat16":
+                # Profile Float8 Model
+                print("profiling float8")
+                float8_suffix = (
+                    f"_{model_type}_float8_compile_{compile}_{scaling_repr}.json"
                 )
-
-            # get the time spent per user annotation
-            sync_time_us = profiler_output_to_gpu_time_for_key(
-                p, "scale_amax_and_scales"
-            )
-            sync_time_ms = sync_time_us / profile_iters / 1e3
-            print(f"Sync time ms: {sync_time_ms}")
-
-    # print the redirected stdout back to regular stdout
-    print(f.getvalue())
+                float8_path = profile_path_prefix + float8_suffix
+                profile_config = ProfileConfig(
+                    float8_path,
+                    float8_suffix,
+                    iters=profile_iters,
+                    warmup_iters=2,
+                    sync=True,
+                )
+                p = profile_function(
+                    profile_config, float8_forw_backward_wrapper, input_tensor
+                )
+                print(f"saved {float8_path}")
+                float8_times = profiler_output_to_time_by_kernel_name(p)
+                total_time_ms = (
+                    sum(v for v in float8_times.values()) / 1e3 / profile_iters
+                )
+                for k, v in float8_times.items():
+                    v_ms = v / 1e3 / profile_iters
+                    data.append(
+                        [
+                            "1_float8",
+                            k,
+                            kernel_name_to_category(k),
+                            v / 1e3 / profile_iters,
+                            v_ms / total_time_ms,
+                            None,
+                        ]
+                    )
+
+                # get the time spent per user annotation
+                sync_time_us = profiler_output_to_gpu_time_for_key(
+                    p, "scale_amax_and_scales"
+                )
+                sync_time_ms = sync_time_us / profile_iters / 1e3
+                print(f"Sync time ms: {sync_time_ms}")
+
+    finally:
+        # print the redirected stdout back to regular stdout
+        # the finally clause is to help print output in the presence of exceptions,
+        # to aid local debugging
+        print(f.getvalue())
 
     # populate the triton kernel bandwidth
     for line in f.getvalue().split("\n"):
```

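The Summary leaves numerics testing as a TODO. Below is a minimal sketch (not part of this commit) of what such a check could look like for the bounded-activation case, comparing dynamic scaling from the observed amax against static scaling at the theoretical sigmoid bound. It uses plain PyTorch float8 casts rather than this repository's APIs; the SQNR metric, the quantize/dequantize round trip, and the tensor shape are illustrative assumptions.

```
import torch

def quant_dequant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # scale into the float8 range, cast, then undo the scaling
    x_f8 = (x * scale).to(torch.float8_e4m3fn)
    return x_f8.to(torch.float32) / scale

def sqnr_db(ref: torch.Tensor, approx: torch.Tensor) -> torch.Tensor:
    # signal-to-quantization-noise ratio in decibels
    return 10 * torch.log10(ref.pow(2).mean() / (ref - approx).pow(2).mean())

x = torch.sigmoid(torch.randn(4096, 4096))
f8_max = torch.finfo(torch.float8_e4m3fn).max

dynamic_scale = f8_max / x.abs().max()     # from the observed amax
static_scale = torch.tensor(f8_max / 1.0)  # from the theoretical sigmoid bound

print("dynamic SQNR (dB):", sqnr_db(x, quant_dequant(x, dynamic_scale)).item())
print("static  SQNR (dB):", sqnr_db(x, quant_dequant(x, static_scale)).item())
```

This mirrors the accuracy caveat in the Summary: static scaling gives up some precision whenever the observed values fall short of the theoretical bound.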