Commit 1e2efa3

Manually apply 4-bit weight packing

Differential Revision: D67051119
Pull Request resolved: #7274
1 parent 66dcd40 commit 1e2efa3

1 file changed (+6, -0)


backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -226,6 +226,12 @@ def _create_quantized_state_dict(
                 self.groupsize,
                 self.precision,  # dtype for scales_and_zeros
             )
+            # If the packing of 2 4-bit values into a single 8-bit value was not
+            # performed in the previous function call, then do it manually now.
+            if w_int4x8.shape == weight.shape:
+                w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to(
+                    torch.uint8
+                )
             # In the original implementation, w_int4x8 is packed via calling the
             # _convert_weight_to_int4pack operator before storing the weight. However
             # the Vulkan implementation does not expect the weights to be packed, so
```
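The added branch detects the unpacked case by shape: if the quantized tensor still has the same shape as the original weight, each element holds one 4-bit value per byte, so the pass packs adjacent column pairs itself (even column into the high nibble, odd column into the low nibble), halving the last dimension. The sketch below illustrates that same slicing and bitwise logic using NumPy arrays in place of torch tensors (the array `w_int4x8` and its values are made up for illustration; the semantics of `[:, ::2] << 4 | [:, 1::2]` match the diff):

```python
import numpy as np

# Hypothetical unpacked 4-bit quantized weights: one value per byte,
# each entry in the range 0..15 (fits in a nibble).
w_int4x8 = np.array(
    [[1, 2, 3, 4],
     [15, 0, 7, 8]],
    dtype=np.uint8,
)

# Pack pairs of adjacent columns into single bytes: the even-indexed
# column becomes the high nibble, the odd-indexed column the low nibble.
# The last dimension shrinks from 4 to 2.
packed = (w_int4x8[:, ::2] << 4 | w_int4x8[:, 1::2]).astype(np.uint8)
print(packed)  # [[ 18  52] [240 120]]

# Unpacking with a shift and a mask recovers the original values.
high = packed >> 4
low = packed & 0x0F
```

The shape check is what makes the packing idempotent: once packed, the tensor's last dimension is half of `weight`'s, so the branch is skipped on any later pass over the same state dict.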
