@@ -351,6 +351,23 @@ def replace_linear_weight_only_int8_per_channel(
                 child, device, node_type, groupsize
             )
 
+def linear_forward_int8(x, weight, scales):
+    n_groups = scales.numel() // scales.shape[0]
+    # need a formulation / custom op for good performance
+    # on eager, CUDA compiled, CPU compiled and ET exported
+
+    # for now, we special-case channel-wise, because we know how to make that fast (but does not work for groupwise)
+    if n_groups == 1:
+        return F.linear(x, weight.to(dtype=x.dtype)) * scales
+
+    return F.linear(
+        x,
+        (
+            weight.to(dtype=x.dtype).view(weight.shape[0], n_groups, -1)
+            * scales.view(weight.shape[0], n_groups, -1)
+        ).view(weight.shape[0], -1),
+    )
+
 
 class WeightOnlyInt8QuantHandler(QuantHandler):
     def __init__(
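The groupwise branch above views each weight row as `(n_groups, group_size)` blocks and multiplies each block by its group scale before the matmul, which is equivalent to dequantizing the whole weight up front. A minimal sketch checking that equivalence (all shapes and names here are illustrative, not from the patch):

```python
import torch
import torch.nn.functional as F

# Illustrative shapes, not from the patch.
out_features, in_features, group_size = 8, 32, 16
n_groups = in_features // group_size

x = torch.randn(4, in_features)
weight = torch.randint(-128, 128, (out_features, in_features), dtype=torch.int8)
scales = torch.rand(out_features, n_groups)

# Groupwise path of linear_forward_int8: scale each (row, group) block of the
# upcast weight, then do a single matmul.
w_scaled = (
    weight.to(x.dtype).view(out_features, n_groups, -1)
    * scales.view(out_features, n_groups, -1)
).view(out_features, -1)
out = F.linear(x, w_scaled)

# Reference: dequantize the full weight first (repeat each scale across its
# group), then do an ordinary float matmul.
w_dequant = weight.to(x.dtype) * scales.repeat_interleave(group_size, dim=1)
assert torch.allclose(out, F.linear(x, w_dequant))
```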
@@ -471,25 +488,7 @@ def __init__(
         )
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        scales = self.scales
-        weight = self.weight
-        scales = scales.view(scales.shape[0], -1)
-        no_groups = scales.shape[1]
-
-        # need a formulation / custom op for good performance
-        # on eager, CUDA compiled, CPU compiled and ET exported
-
-        # for now, we special-case channel-wise, because we know how to make that fast (but does not work for groupwise)
-        if scales.shape[1] == 1:
-            return F.linear(input, weight.to(dtype=input.dtype)) * self.scales
-        else:
-            return F.linear(
-                input,
-                (
-                    weight.to(dtype=input.dtype).view(weight.shape[0], no_groups, -1)
-                    * scales.view(weight.shape[0], no_groups, -1)
-                ).view(weight.shape[0], -1),
-            )
+        return linear_forward_int8(input, self.weight, self.scales)
 
 
 #########################################################################
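On the caller side, channel-wise quantization stores one scale per output row, so `scales.numel() // scales.shape[0] == 1` and `linear_forward_int8` takes the fast path: one matmul on the upcast int8 weight followed by a single columnwise multiply. A usage sketch with a hypothetical `quantize_per_channel` helper (the patch's actual quantization lives in `WeightOnlyInt8QuantHandler`, not shown here; `linear_forward_int8` from the patch is assumed in scope):

```python
import torch
import torch.nn.functional as F

def quantize_per_channel(w: torch.Tensor):
    # Hypothetical helper: symmetric int8 quantization with one scale per
    # output row (channel-wise), i.e. scales has shape (out_features,).
    scales = w.abs().amax(dim=1) / 127.0
    w_int8 = torch.round(w / scales[:, None]).to(torch.int8)
    return w_int8, scales

w = torch.randn(8, 32)
x = torch.randn(4, 32)
w_int8, scales = quantize_per_channel(w)

# n_groups == scales.numel() // scales.shape[0] == 1, so the channel-wise
# fast path runs: F.linear on the upcast weight, then one scale multiply.
out = linear_forward_int8(x, w_int8, scales)
print((out - F.linear(x, w)).abs().max())  # small quantization error
```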