Skip to content

Commit 5c6cefc

Browse files
kimishpatel authored and facebook-github-bot committed
Use custom cpp op for packing 4 bit weights (#1899)
Summary: Pull Request resolved: #1899 It is extremely slow otherwise ghstack-source-id: 215452811 exported-using-ghexport Reviewed By: digantdesai Differential Revision: D53594767 fbshipit-source-id: a7af8e4aea86c6ef7dec6036d0257dbc7b323a59
1 parent b601b49 commit 5c6cefc

File tree

5 files changed

+123
-5
lines changed

5 files changed

+123
-5
lines changed

backends/xnnpack/operators/TARGETS

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,18 @@ runtime.python_library(
1010
"@EXECUTORCH_CLIENTS",
1111
],
1212
deps = [
13+
":convert_to_qc4w",
1314
"//executorch/backends/xnnpack/utils:xnnpack_utils",
1415
"//executorch/exir:graph_module",
1516
"//executorch/exir/backend:backend_details",
1617
],
1718
)
19+
20+
# Custom C++ op that packs 4-bit weights natively; the pure-Python
# packing path is far too slow for real models.
runtime.cxx_library(
    name = "convert_to_qc4w",
    srcs = ["convert_to_qc4w.cpp"],
    visibility = ["//executorch/..."],
    external_deps = ["libtorch"],
)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <ATen/ATen.h>
10+
#include <torch/library.h>
11+
12+
at::Tensor convert_to_qc4w(at::Tensor x) {
13+
std::vector<int64_t> sizes = x.sizes().vec();
14+
TORCH_CHECK(sizes.size() == 2, "Expecting 2D tensor");
15+
TORCH_CHECK(sizes[1] % 2 == 0);
16+
TORCH_CHECK(
17+
x.options().dtype() == at::kByte, "Input tensor must be of type uint8.");
18+
sizes[1] = sizes[1] / 2;
19+
at::Tensor output = at::empty(sizes, x.options().dtype());
20+
uint8_t* x_ptr = x.data_ptr<uint8_t>();
21+
uint8_t* output_ptr = output.data_ptr<uint8_t>();
22+
for (int i = 0; i < output.numel(); ++i) {
23+
int32_t input_i = i * 2;
24+
int32_t input_i_plus_1 = i * 2 + 1;
25+
output_ptr[i] = (x_ptr[input_i_plus_1] << 4) | (x_ptr[input_i]);
26+
}
27+
return output;
28+
}
29+
30+
TORCH_LIBRARY_FRAGMENT(xnnpack, m) {
31+
m.def("convert_to_qc4w", &convert_to_qc4w);
32+
}

backends/xnnpack/operators/node_visitor.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -409,11 +409,19 @@ def convert_to_qc4w(inp: torch.Tensor) -> torch.Tensor:
409409
ric = int((ic + 1) / 2)
410410
result = torch.zeros([oc, ric], dtype=torch.uint8)
411411

412-
for o in range(oc):
413-
for i in range(ric):
414-
j = 2 * i
415-
result[o][i] = inp[o][j]
416-
result[o][i] += inp[o][j + 1] << 4
412+
try:
413+
# TODO(): Enable this in OSS
414+
torch.ops.load_library(
415+
"//executorch/backends/xnnpack/operators:convert_to_qc4w"
416+
)
417+
result = torch.ops.xnnpack.convert_to_qc4w(inp)
418+
except:
419+
# Fallback to python implementation
420+
for o in range(oc):
421+
for i in range(ric):
422+
j = 2 * i
423+
result[o][i] = inp[o][j]
424+
result[o][i] += inp[o][j + 1] << 4
417425

418426
return result
419427

backends/xnnpack/test/TARGETS

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,15 @@ runtime.python_test(
7474
"//executorch/backends/xnnpack:xnnpack_preprocess",
7575
],
7676
)
77+
78+
# Unit test comparing the custom C++ qc4w packing op against a
# pure-Python reference implementation.
runtime.python_test(
    name = "test_custom_convert_qc4w_op",
    srcs = ["ops/test_custom_convert_to_qc4w.py"],
    external_deps = ["libtorch"],
    deps = [
        "//caffe2:torch",
        "//executorch/backends/xnnpack/operators:convert_to_qc4w",
    ],
)
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import unittest
8+
9+
import torch
10+
11+
12+
class TestCustomQC4WConvert(unittest.TestCase):
    """Tests for the custom torch.ops.xnnpack.convert_to_qc4w packing op."""

    def setUp(self) -> None:
        # Registers torch.ops.xnnpack.convert_to_qc4w for the tests below.
        torch.ops.load_library(
            "//executorch/backends/xnnpack/operators:convert_to_qc4w"
        )

    @staticmethod
    def _ref_output(inp: torch.Tensor) -> torch.Tensor:
        """Pure-Python reference: pack column pairs into bytes.

        Even column goes to the low nibble, odd column to the high nibble.
        """
        oc, ic = inp.shape
        if ic % 2 != 0:
            raise ValueError("Number of input channels not divisible by 2.")
        ric = (ic + 1) // 2
        result = torch.zeros([oc, ric], dtype=torch.uint8)
        for o in range(oc):
            for i in range(ric):
                j = 2 * i
                result[o][i] = inp[o][j]
                result[o][i] += inp[o][j + 1] << 4
        return result

    def test_convert(self) -> None:
        inp = torch.randint(low=0, high=15, size=(20, 42), dtype=torch.uint8)
        result = torch.ops.xnnpack.convert_to_qc4w(inp)
        # assertTrue instead of a bare `assert`: bare asserts are stripped
        # when Python runs with -O, silently disabling the test.
        self.assertTrue(
            torch.equal(result, self._ref_output(inp)), "Outputs dont match"
        )

    def test_convert_throws(self) -> None:
        # assertRaises replaces the previous bare-except flag pattern, which
        # also swallowed KeyboardInterrupt/SystemExit. The op's TORCH_CHECK
        # failures presumably surface as RuntimeError; Exception is used to
        # stay as permissive as the original bare except.

        # Odd number of columns must be rejected.
        inp = torch.randint(low=0, high=15, size=(20, 41), dtype=torch.uint8)
        with self.assertRaises(Exception):
            torch.ops.xnnpack.convert_to_qc4w(inp)

        # Non-uint8 dtype must be rejected.
        with self.assertRaises(Exception):
            torch.ops.xnnpack.convert_to_qc4w(torch.rand((20, 41)))

0 commit comments

Comments
 (0)