Commit 5ebcf7f

create quantized_linear_per_tensor_out in cpu

Author: Zonglin Peng
Parent: ab455df

File tree

4 files changed: +336 -0 lines changed

backends/cadence/aot/functions.yaml

Lines changed: 5 additions & 0 deletions

@@ -183,3 +183,8 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_matmul_out
+
+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_linear_per_tensor_out
Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <executorch/runtime/core/array_ref.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <optional>

namespace cadence {
namespace impl {
namespace cpu {
namespace native {
namespace {

using ::executorch::runtime::getLeadingDims;

#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
  _(uint8_t, Byte)                           \
  _(int8_t, Char)

inline __attribute__((always_inline)) void linear_(
    const ::executorch::aten::Tensor& input,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
    ::executorch::aten::Tensor& output) {
  const float* __restrict__ input_data = input.const_data_ptr<float>();
  const float* __restrict__ weight_data = weight.const_data_ptr<float>();
  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
  float* __restrict__ output_data = output.mutable_data_ptr<float>();

  // input comes in shape [batch_size, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [batch_size, out_dim]
  // Perform matrix multiply (M x N) x (N x P) => M x P
  int64_t M = weight.size(0); // = out_dim
  int64_t N = weight.size(1); // = in_dim

  // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the
  // leading dimension is d0 * d1 * ... * d_{N-2}
  int64_t leading_dims = getLeadingDims(input, input.dim() - 1);

  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < M; ++j) {
      float sum = bias_data[j];
      for (int k = 0; k < N; ++k) {
        sum += input_data[i * N + k] * weight_data[j * N + k];
      }
      output_data[i * M + j] = sum;
    }
  }
}

} // namespace
} // namespace native
} // namespace cpu
} // namespace impl
} // namespace cadence
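The float linear_ helper above is a plain row-major inner-product loop over the flattened leading dimensions. A minimal standalone sketch of the same arithmetic on raw arrays (illustrative names and sizes, no ExecuTorch tensor plumbing):

// Sketch of the same math as linear_ above, on raw float arrays.
// linear_ref and the example shapes are illustrative, not part of the commit.
#include <cstdio>

// out[i][j] = bias[j] + sum_k in[i][k] * w[j][k], with w stored as [out_dim][in_dim]
void linear_ref(const float* in, const float* w, const float* bias,
                float* out, int leading_dims, int in_dim, int out_dim) {
  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < out_dim; ++j) {
      float sum = bias[j];
      for (int k = 0; k < in_dim; ++k) {
        sum += in[i * in_dim + k] * w[j * in_dim + k];
      }
      out[i * out_dim + j] = sum;
    }
  }
}

int main() {
  const float in[2 * 3] = {1, 2, 3, 4, 5, 6};  // input  [2, 3]
  const float w[2 * 3] = {1, 0, 0, 0, 1, 0};   // weight [2, 3]: rows pick x0 and x1
  const float bias[2] = {0.5f, -0.5f};
  float out[2 * 2];
  linear_ref(in, w, bias, out, /*leading_dims=*/2, /*in_dim=*/3, /*out_dim=*/2);
  for (float v : out) std::printf("%.1f ", v);  // prints: 1.5 1.5 4.5 4.5
  return 0;
}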

backends/cadence/reference/operators/quantized_linear_out.cpp

Lines changed: 39 additions & 0 deletions

@@ -7,6 +7,8 @@
  */
 
 #include <executorch/backends/cadence/reference/kernels/kernels.h>
+#include <executorch/backends/cadence/reference/operators/operators.h>
+#include <executorch/backends/cadence/reference/operators/quantized_ops.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace impl {
@@ -85,6 +87,7 @@ void quantized_linear_out(
     int64_t out_zero_point,
     __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
+  // TODO: refactor to use a switch case, as in quantized_linear_per_tensor_out
   if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _typed_quantized_linear<uint8_t>(
         src,
@@ -115,6 +118,42 @@ void quantized_linear_out(
   }
 }
 
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& src,
+    const Tensor& weight,
+    const Tensor& bias,
+    const int64_t src_zero_point,
+    const int64_t weight_zero_point,
+    const int64_t out_multiplier,
+    const int64_t out_shift,
+    const int64_t out_zero_point,
+    __ET_UNUSED const executorch::aten::optional<Tensor>& offset,
+    Tensor& out) {
+#define typed_quantized_linear_per_tensor(ctype, dtype) \
+  case executorch::aten::ScalarType::dtype: {           \
+    quantized_linear_per_tensor_<ctype>(                 \
+        src,                                             \
+        weight,                                          \
+        bias,                                            \
+        src_zero_point,                                  \
+        weight_zero_point,                               \
+        out_multiplier,                                  \
+        out_shift,                                       \
+        out_zero_point,                                  \
+        out);                                            \
+    break;                                               \
+  }
+
+  executorch::aten::ScalarType dtype = out.scalar_type();
+  switch (dtype) {
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_linear_per_tensor);
+    default:
+      ET_DCHECK_MSG(false, "Unhandled dtype %s", toString(dtype));
+  }
+#undef typed_quantized_linear_per_tensor
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
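The new out-variant dispatches on the output dtype through the ET_FORALL_CADENCE_QUANTIZED_TYPES X-macro from the header above, which applies the case macro to (uint8_t, Byte) and (int8_t, Char). Hand-expanding it (an illustrative expansion of the switch body, not code from the commit) gives:

// Hand-expanded view of the switch inside quantized_linear_per_tensor_out.
// Fragment only; src, weight, bias, the zero points, multiplier, shift and out
// are the function arguments shown in the diff above.
switch (out.scalar_type()) {
  case executorch::aten::ScalarType::Byte: {
    quantized_linear_per_tensor_<uint8_t>(
        src, weight, bias, src_zero_point, weight_zero_point,
        out_multiplier, out_shift, out_zero_point, out);
    break;
  }
  case executorch::aten::ScalarType::Char: {
    quantized_linear_per_tensor_<int8_t>(
        src, weight, bias, src_zero_point, weight_zero_point,
        out_multiplier, out_shift, out_zero_point, out);
    break;
  }
  default:
    ET_DCHECK_MSG(false, "Unhandled dtype %s", toString(out.scalar_type()));
}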
Lines changed: 233 additions & 0 deletions

@@ -0,0 +1,233 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <executorch/backends/cadence/reference/kernels/kernels.h>
#include <executorch/backends/cadence/reference/operators/operators.h>

using executorch::runtime::getLeadingDims;

// Generate kernels that perform elementwise arithmetic on two quantized
// tensors. The tensors are either the same size, or the second tensor is a
// scalar.
#define DECLARE_POINTWISE_TENSOR_QUANTIZED_BINARY_OP(BINARY_FUNC_NAME, OP)    \
  template <typename T>                                                       \
  void BINARY_FUNC_NAME(                                                      \
      const ::executorch::aten::Tensor& X,                                    \
      float X_scale,                                                          \
      int32_t X_zero_point,                                                   \
      const ::executorch::aten::Tensor& Y,                                    \
      float Y_scale,                                                          \
      int32_t Y_zero_point,                                                   \
      float out_scale,                                                        \
      int32_t out_zero_point,                                                 \
      ::executorch::aten::Tensor& out) {                                      \
    const T* __restrict__ X_data = X.const_data_ptr<T>();                     \
    const T* __restrict__ Y_data = Y.const_data_ptr<T>();                     \
    T* __restrict__ out_data = out.mutable_data_ptr<T>();                     \
    size_t Y_numel = Y.numel();                                               \
    size_t X_numel = X.numel();                                               \
    float inv_out_scale = 1.0f / out_scale;                                   \
    /* Tensor that has the same number of elements as X */                    \
    if (Y_numel == X_numel) {                                                 \
      for (size_t i = 0; i < X_numel; ++i) {                                  \
        float x = kernels::dequantize<T>(X_data[i], X_scale, X_zero_point);   \
        float y = kernels::dequantize<T>(Y_data[i], Y_scale, Y_zero_point);   \
        float z = x OP y;                                                     \
        out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point); \
      }                                                                       \
    } /* if Y is a scalar Tensor */                                           \
    else if (Y_numel == 1) {                                                  \
      float y = kernels::dequantize<T>(Y_data[0], Y_scale, Y_zero_point);     \
      for (size_t i = 0; i < X_numel; ++i) {                                  \
        float x = kernels::dequantize<T>(X_data[i], X_scale, X_zero_point);   \
        float z = x OP y;                                                     \
        out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point); \
      }                                                                       \
    } /* other broadcasting cases */                                          \
    else {                                                                    \
      ET_DCHECK_MSG(false, "Unsupported broadcasting");                       \
    }                                                                         \
  }

template <typename T>
inline __attribute__((always_inline)) void quantized_linear_per_tensor_(
    const ::executorch::aten::Tensor& src,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::Tensor& bias,
    const int64_t src_zero_point,
    const int64_t weight_zero_point,
    const int64_t out_multiplier,
    const int64_t out_shift,
    const int64_t out_zero_point,
    ::executorch::aten::Tensor& out) {
  // input comes in shape [leading_dims, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [leading_dims, out_dim]
  // Perform matrix multiply (M x N) x (N x P)' => M x P
  const int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
  const int64_t out_dim = weight.size(0); // = out_dim
  const int64_t in_dim = weight.size(1); // = in_dim

  const T* __restrict__ in_data = src.const_data_ptr<T>();
  const T* __restrict__ weight_data = weight.const_data_ptr<T>();
  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
  T* __restrict__ out_data = out.mutable_data_ptr<T>();

  // Compute the requant_scale from out_multiplier and out_shift
  const float requant_scale =
      -out_multiplier * 1.0 / (1 << 31) * pow(2, out_shift);

  for (size_t i = 0; i < leading_dims; ++i) {
    for (size_t j = 0; j < out_dim; ++j) {
      int32_t sum = bias_data[j];
      for (size_t k = 0; k < in_dim; ++k) {
        int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point;
        int32_t w =
            (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point;
        sum += x * w;
      }
      out_data[i * out_dim + j] = ::impl::reference::kernels::quantize<T>(
          sum, requant_scale, out_zero_point);
    }
  }
}

template <typename T>
inline __attribute__((always_inline)) void quantized_linear_per_tensor_(
    const ::executorch::aten::Tensor& src,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::Tensor& bias,
    int64_t src_zero_point,
    const ::executorch::aten::Tensor& weight_zero_point_t,
    int64_t out_multiplier,
    int64_t out_shift,
    int64_t out_zero_point,
    ::executorch::aten::Tensor& out) {
  // Get the zero_point of weight.
  int32_t weight_zero_point = weight_zero_point_t.const_data_ptr<int32_t>()[0];
  quantized_linear_per_tensor_<T>(
      src,
      weight,
      bias,
      src_zero_point,
      weight_zero_point,
      out_multiplier,
      out_shift,
      out_zero_point,
      out);
}

template <typename T>
inline __attribute__((always_inline)) void quantized_linear_per_channel_(
    const ::executorch::aten::Tensor& src,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::Tensor& bias,
    int64_t src_zero_point,
    int64_t weight_zero_point,
    const ::executorch::aten::Tensor& out_multiplier,
    const ::executorch::aten::Tensor& out_shift,
    int64_t out_zero_point,
    ::executorch::aten::Tensor& out) {
  // input comes in shape [leading_dims, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [leading_dims, out_dim]
  // Perform matrix multiply (M x N) x (N x P)' => M x P
  int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
  const int64_t out_dim = weight.size(0); // = out_dim
  const int64_t in_dim = weight.size(1); // = in_dim

  const T* __restrict__ in_data = src.const_data_ptr<T>();
  const T* __restrict__ weight_data = weight.const_data_ptr<T>();
  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
  T* __restrict__ out_data = out.mutable_data_ptr<T>();
  const int32_t* __restrict__ out_multiplier_data =
      out_multiplier.const_data_ptr<int32_t>();
  const int32_t* __restrict__ out_shift_data =
      out_shift.const_data_ptr<int32_t>();

  for (size_t i = 0; i < leading_dims; ++i) {
    for (size_t j = 0; j < out_dim; ++j) {
      int32_t sum = bias_data[j];
      for (size_t k = 0; k < in_dim; ++k) {
        int32_t x = (int32_t)in_data[i * in_dim + k] - src_zero_point;
        int32_t w =
            (int32_t)weight_data[j * in_dim + k] - (int32_t)weight_zero_point;
        sum += x * w;
      }
      // Compute the out_scale from out_multiplier and out_shift
      const float out_scale =
          -out_multiplier_data[j] * 1.0 / (1 << 31) * pow(2, out_shift_data[j]);
      out_data[i * out_dim + j] = ::impl::reference::kernels::quantize<T>(
          sum, out_scale, out_zero_point);
    }
  }
}

template <typename T>
inline __attribute__((always_inline)) void quantized_linear_(
    const ::executorch::aten::Tensor& src,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::Tensor& bias,
    int64_t src_zero_point,
    int64_t weight_zero_point,
    const ::executorch::aten::Tensor& out_multiplier,
    const ::executorch::aten::Tensor& out_shift,
    int64_t out_zero_point,
    ::executorch::aten::Tensor& out) {
  if (out_multiplier.numel() == 1) {
    // Use per-tensor quantization kernel.
    const int32_t* __restrict__ out_multiplier_data =
        out_multiplier.const_data_ptr<int32_t>();
    const int32_t* __restrict__ out_shift_data =
        out_shift.const_data_ptr<int32_t>();
    quantized_linear_per_tensor_<T>(
        src,
        weight,
        bias,
        src_zero_point,
        weight_zero_point,
        out_multiplier_data[0],
        out_shift_data[0],
        out_zero_point,
        out);
    return;
  }

  // Use per-channel quantization kernel.
  quantized_linear_per_channel_<T>(
      src,
      weight,
      bias,
      src_zero_point,
      weight_zero_point,
      out_multiplier,
      out_shift,
      out_zero_point,
      out);
}

template <typename T>
inline __attribute__((always_inline)) void quantized_linear_(
    const ::executorch::aten::Tensor& src,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::Tensor& bias,
    int64_t src_zero_point,
    const ::executorch::aten::Tensor& weight_zero_point_t,
    const ::executorch::aten::Tensor& out_multiplier,
    const ::executorch::aten::Tensor& out_shift,
    int64_t out_zero_point,
    ::executorch::aten::Tensor& out) {
  // Get the zero_point of weight.
  int32_t weight_zero_point = weight_zero_point_t.const_data_ptr<int32_t>()[0];
  quantized_linear_<T>(
      src,
      weight,
      bias,
      src_zero_point,
      weight_zero_point,
      out_multiplier,
      out_shift,
      out_zero_point,
      out);
}
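The requantization in the kernels above folds out_multiplier and out_shift into a single float scale: out_multiplier is a Q0.31 fixed-point value, so dividing by 2^31 recovers the fractional multiplier, and 2^out_shift applies the shift (the leading minus sign in the code compensates for 1 << 31 wrapping to a negative value on typical 32-bit int targets). Below is a minimal numeric sketch of that scale computation plus a plausible quantize step; the rounding/clamping behavior of kernels::quantize is an assumption here, and the sketch uses an unsigned shift instead of the sign trick in the commit's code.

// Sketch of the per-tensor requantization arithmetic; quantize_i8 models an
// assumed round-then-clamp behavior, not the actual Cadence kernels helper.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// out_multiplier is a Q0.31 fixed-point multiplier, out_shift a power-of-two exponent.
float requant_scale(int32_t out_multiplier, int32_t out_shift) {
  return static_cast<float>(out_multiplier) / (1ull << 31) * std::pow(2.0f, out_shift);
}

// Assumed quantize step: scale the int32 accumulator, round, add zero point, clamp to int8.
int8_t quantize_i8(int32_t acc, float scale, int32_t out_zero_point) {
  float v = std::round(acc * scale) + out_zero_point;
  return static_cast<int8_t>(std::min(127.0f, std::max(-128.0f, v)));
}

int main() {
  // Example: effective scale 0.5 -> multiplier = 2^30 (0.5 in Q0.31), shift = 0.
  const int32_t mult = 1 << 30, shift = 0, zp = 3;
  const int32_t acc = 42;  // int32 accumulator from the matmul inner loop
  const float scale = requant_scale(mult, shift);
  std::printf("scale=%f q=%d\n", scale, quantize_i8(acc, scale, zp));
  // Expected: scale=0.5, q = round(42 * 0.5) + 3 = 24
  return 0;
}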
