
Commit 8605a11

Use _weight_int8pack_mm for CPU + eager (#472)
* Use _weight_int8pack_mm for CPU + eager
* Skip for older PyTorch versions
1 parent 9b710b0 commit 8605a11

File tree

1 file changed (+13, -1)

quantize.py

Lines changed: 13 additions & 1 deletion
@@ -351,14 +351,26 @@ def replace_linear_weight_only_int8_per_channel(
                child, device, node_type, groupsize
            )

+
def linear_forward_int8(x, weight, scales):
    n_groups = scales.numel() // scales.shape[0]
    # need a formulation / custom op for good performance
    # on eager, CUDA compiled, CPU compiled and ET exported

    # for now, we special-case channel-wise, because we know how to make that fast (but does not work for groupwise)
    if n_groups == 1:
-        return F.linear(x, weight.to(dtype=x.dtype)) * scales
+        if (
+            torch.compiler.is_compiling()
+            or x.device.type != "cpu"
+            or torch.__version__ < "2.4"
+        ):
+            return F.linear(x, weight.to(dtype=x.dtype)) * scales
+        # Use int8pack_mm for CPU eager
+        return torch.ops.aten._weight_int8pack_mm(
+            x.reshape(-1, x.shape[-1]),
+            weight,
+            scales,
+        ).reshape(x.shape[:-1] + (weight.shape[0],))

    return F.linear(
        x,
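
For readers who want to sanity-check the new path, here is a minimal sketch (not part of the commit) comparing the generic F.linear fallback with the CPU-eager fast path the diff introduces. The shapes, dtypes, and the final comparison are illustrative assumptions; it requires PyTorch >= 2.4 on CPU, where aten._weight_int8pack_mm is available.

# Minimal sketch, not from quantize.py: shapes and dtypes are illustrative assumptions.
# Assumes PyTorch >= 2.4 on CPU, where aten._weight_int8pack_mm exists.
import torch
import torch.nn.functional as F

x = torch.randn(4, 64, dtype=torch.bfloat16)                   # activations, shape (m, k)
weight = torch.randint(-128, 128, (32, 64), dtype=torch.int8)  # channel-wise int8 weight, shape (n, k)
scales = torch.rand(32, dtype=torch.bfloat16)                  # per-output-channel scales, shape (n,)

# Generic path: dequantize the weight to the activation dtype, then scale the output.
ref = F.linear(x, weight.to(dtype=x.dtype)) * scales

# Fast path added by this commit for CPU eager: int8 matmul with per-channel scaling.
out = torch.ops.aten._weight_int8pack_mm(x, weight, scales)

# The two paths should agree up to bfloat16 rounding.
print((ref - out).abs().max())

The guard in the diff keeps the F.linear path when torch.compile is tracing, when the tensor is not on CPU, or when the PyTorch version is older than 2.4, so only plain CPU eager execution goes through the packed int8 kernel.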
