Skip to content

Commit 81fce9c

Browse files
Use int4mm weight packing MPS kernel (#866)
* Use int4mm weight packing MPS kernel
* Update torch nightly version
1 parent d71783c commit 81fce9c

File tree

2 files changed

+4
-10
lines changed

2 files changed

+4
-10
lines changed

install_requirements.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ $PIP_EXECUTABLE install -r requirements.txt --extra-index-url https://download.p
4646
# NOTE: If a newly-fetched version of the executorch repo changes the value of
4747
# NIGHTLY_VERSION, you should re-run this script to install the necessary
4848
# package versions.
49-
NIGHTLY_VERSION=dev20240613
49+
NIGHTLY_VERSION=dev20240624
5050

5151
# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
5252
$PIP_EXECUTABLE uninstall -y triton

quantization/qops.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -395,15 +395,9 @@ def _prepare_weight_and_scales_and_zeros(
395395
weight_int32, scales_and_zeros = group_quantize_tensor(
396396
weight_bf16, n_bit=4, groupsize=groupsize
397397
)
398-
if weight_bf16.device.type == "mps":
399-
# There are still no MPS-accelerated conversion OP
400-
weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
401-
weight_int32.cpu(), inner_k_tiles
402-
).to("mps")
403-
else:
404-
weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
405-
weight_int32, inner_k_tiles
406-
)
398+
weight_int4pack = torch.ops.aten._convert_weight_to_int4pack(
399+
weight_int32, inner_k_tiles
400+
)
407401
return weight_int4pack, scales_and_zeros
408402

409403
@classmethod

0 commit comments

Comments (0)