Commit 219fe79

Use int4mm weight packing mps kernel
1 parent d71783c commit 219fe79

quantization/qops.py

Lines changed: 3 additions & 9 deletions
@@ -395,15 +395,9 @@ def _prepare_weight_and_scales_and_zeros(
         weight_int32, scales_and_zeros = group_quantize_tensor(
             weight_bf16, n_bit=4, groupsize=groupsize
         )
-        if weight_bf16.device.type == "mps":
-            # There are still no MPS-accelerated conversion OP
-            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-                weight_int32.cpu(), inner_k_tiles
-            ).to("mps")
-        else:
-            weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
-                weight_int32, inner_k_tiles
-            )
+        weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
+            weight_int32, inner_k_tiles
+        )
         return weight_int4pack, scales_and_zeros

     @classmethod
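The MPS special case could be dropped because PyTorch now ships an MPS kernel for torch.ops.aten._convert_weight_to_int4pack, so the group-quantized weight no longer needs to be packed on the CPU and copied back to the "mps" device. Below is a minimal sketch of that op in isolation, not part of the commit: it assumes a PyTorch build whose MPS kernel accepts the int32 group-quantized layout used here (the op's expected input dtype has shifted across PyTorch versions), and the tensor shape and inner_k_tiles value are illustrative.

import torch

# Illustrative stand-in for the output of group_quantize_tensor: 4-bit values
# stored as int32, already resident on the MPS device. The 128x128 shape and
# inner_k_tiles = 2 are assumptions for this sketch, not values from the repo.
weight_int32 = torch.randint(0, 16, (128, 128), dtype=torch.int32, device="mps")
inner_k_tiles = 2

# With the MPS kernel available, packing runs directly on the device,
# replacing the old weight_int32.cpu() ... .to("mps") round-trip.
weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(weight_int32, inner_k_tiles)
print(weight_int4pack.device, weight_int4pack.shape, weight_int4pack.dtype)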
