Better 4bit packing (#2649)

digantdesai · facebook-github-bot · commit 781ba314db35 · 2024-03-25T09:09:21.000-07:00
Summary:

Just use tensor methods for the 4-bit packing; drop the custom op and the previous Python fallback logic.

Differential Revision: D55319010
diff --git a/backends/xnnpack/operators/node_visitor.py b/backends/xnnpack/operators/node_visitor.py
@@ -481,34 +481,22 @@ def convert_to_qc4w(inp: torch.Tensor) -> torch.Tensor:
             assert (
                 inp.ndim == 2
             ), f"convert_to_qc4w: expecting input tensor to be 2d, got {inp.ndim}"
-        oc, ic = inp.shape
+        assert inp.ndim == 2, "convert_to_qc4w: expecting input tensor to be 2d"
 
         # pad ic
-        if ic % 2 != 0:
+        if inp.shape[-1] % 2 != 0:
             inp = F.pad(input=inp, pad=(0, 1, 0, 0), mode="constant", value=0)
 
+        # Shape after padding
+        oc, ic = inp.shape
+        assert ic % 2 == 0, "convert_to_qc4w: expecting ic to be even"
+
         # Adjust inp tensor for zp
         inp = inp.to(dtype=torch.uint8) + 8
 
-        # prepare result tensor
-        ric = int((ic + 1) / 2)
-        result = torch.zeros([oc, ric], dtype=torch.uint8)
-
-        try:
-            aot_path = NodeVisitor.find_aot_util_path()
-            torch.ops.load_library(aot_path)
-            result = torch.ops.xnnpack.convert_to_qc4w(inp)
-        except:
-            # Fallback to python implementation
-            # TODO Warn the user? They might be developing in-tree and didn't install,
-            # in which case, this will be very slow for large models.
-            for o in range(oc):
-                for i in range(ric):
-                    j = 2 * i
-                    result[o][i] = inp[o][j]
-                    result[o][i] += inp[o][j + 1] << 4
-
-        return result
+        # Prepare the Result tensor
+        inp = inp.contiguous().view(-1)
+        return (inp[1::2] << 4 | inp[::2]).view(oc, ic / 2)
 
     def get_serialized_buffer_index(
         self,