Commit 8299fe3

Michael Gschwind authored and facebook-github-bot committed

Add mixed dtype linear (#2023)

Summary:
Pull Request resolved: #2023

Add mixed dtype linear

Reviewed By: mavlyutovr, manuelcandales

Differential Revision: D53995591

fbshipit-source-id: f68e2fd1254cb3717f2276eef9375c944cb99d60

1 parent: 57e192b

File tree

5 files changed: +192 -1 lines changed

examples/models/llama2/ops/quantized.yaml

Lines changed: 6 additions & 0 deletions

@@ -3,3 +3,9 @@
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::quantized_embedding_byte_out
+
+- func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::quantized_mixed_linear_out

kernels/portable/cpu/vec_ops.h

Lines changed: 37 additions & 1 deletion

@@ -10,10 +10,12 @@

 #include <algorithm>
 #include <cmath>
+#include <cstdint>
 #include <cstring>
+#include <iostream>
 #include <numeric>
+#include <ostream>
 #include <type_traits>
-
 /**
  * @file
  * This header defines common, low-level operations that can often be

@@ -103,6 +105,40 @@ inline void vec_quantized_matmul_int8(
   }
 }

+static inline size_t bounds_min(size_t a, size_t b) {
+  return (a < b) ? a : b;
+}
+
+/// x: m * n, y: p * n, z: m * p, s: p * groups
+/// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
+template <typename T, typename U = T, typename V = U>
+inline void vec_quantized_matmul_transb_int8(
+    T* __restrict__ z,
+    const U* __restrict__ x,
+    const int8_t* __restrict__ y,
+    const V* __restrict__ s,
+    int64_t m,
+    int64_t n,
+    int64_t p,
+    int64_t g) {
+  int64_t n_over_g = (n + g - 1) / g;
+
+  for (size_t i = 0; i < m; ++i) {
+    for (size_t j = 0; j < p; ++j) {
+      T sum = 0;
+      for (size_t k = 0; k < n; k += g) {
+        T psum = 0;
+        // the last group may have fewer than g elements
+        for (size_t k2 = k; k2 < bounds_min(k + g, n); k2++) {
+          psum += x[i * n + k2] * y[j * n + k2];
+        }
+        sum += psum * s[j * n_over_g + k / g];
+      }
+      z[i * p + j] = sum;
+    }
+  }
+}
+
 // mat1 (m x n), mat2 (n x p), out (m, p), self (m x p)
 // z[i][j] = sum(x[i][k] * y[k][j]), for k in range(n)
 // T for tensor dtype, U for scalar type
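For intuition, here is a minimal usage sketch of the new helper (not part of the commit) with m = 1, n = 4, p = 2, and group size g = 2, so each output channel carries two scales. The include path and namespace follow the header above; the arithmetic in the comments is worked out by hand.

#include <executorch/kernels/portable/cpu/vec_ops.h>

#include <cstdint>
#include <cstdio>

int main() {
  // x: 1 x 4 activations; y: 2 x 4 int8 weights, already transposed
  // (one row per output channel); s: 2 x 2 per-group scales, g = 2.
  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const int8_t y[8] = {1, -1, 2, 2, 0, 1, -2, 1};
  const float s[4] = {0.5f, 0.25f, 1.0f, 2.0f};
  float z[2] = {0.0f, 0.0f};

  torch::executor::vec_quantized_matmul_transb_int8<float>(
      z, x, y, s, /*m=*/1, /*n=*/4, /*p=*/2, /*g=*/2);

  // z[0] = (1*1 + 2*-1) * 0.5 + (3*2 + 4*2) * 0.25 = -0.5 + 3.5 = 3.0
  // z[1] = (1*0 + 2*1) * 1.0 + (3*-2 + 4*1) * 2.0 = 2.0 - 4.0 = -2.0
  std::printf("z = [%f, %f]\n", z[0], z[1]);
  return 0;
}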
kernels/quantized/cpu/op_mixed_linear.cpp

Lines changed: 137 additions & 0 deletions (new file)

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/portable/cpu/vec_ops.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;

bool check_quantized_mixed_linear_args(
    const Tensor& in,
    const Tensor& weight,
    const Tensor& weight_scales,
    const optional<Tensor>& opt_weight_zero_points,
    const optional<ScalarType> dtype,
    Tensor& out) {
  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(in, 2));
  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight, 2));
  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(weight_scales, 1));
  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(out, 2));

  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_size_at_dims(in, 1, weight, 1));
  ET_LOG_AND_RETURN_IF_FALSE(
      tensors_have_same_size_at_dims(weight_scales, 0, weight, 0));

  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, weight_scales));
  if (dtype.has_value()) {
    ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value());
    ET_LOG_MSG_AND_RETURN_IF_FALSE(
        dtype.value() == ScalarType::Float || dtype.value() == ScalarType::Half,
        "dtype must be Float or Half");
  }
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      weight.scalar_type() == ScalarType::Char, "weight dtype must be int8");
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      in.scalar_type() == ScalarType::Float ||
          in.scalar_type() == ScalarType::Half,
      "input dtype must be Float or Half");

  if (opt_weight_zero_points.has_value()) {
    ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_shape(
        opt_weight_zero_points.value(), weight_scales));
    ET_LOG_AND_RETURN_IF_FALSE(
        tensors_have_same_dtype(opt_weight_zero_points.value(), in));
  }

  // Support for non-null zero points is not implemented yet.
  ET_LOG_MSG_AND_RETURN_IF_FALSE(
      !opt_weight_zero_points.has_value(), "zero points not supported yet.");
  return true;
}

Tensor& quantized_mixed_linear_out(
    const Tensor& in,
    const Tensor& weight,
    const Tensor& weight_scales,
    const optional<Tensor>& opt_weight_zero_points,
    const optional<ScalarType> dtype,
    Tensor& out) {
  // No kernel context is in scope in this overload, so fail hard with
  // ET_CHECK rather than ET_KERNEL_CHECK.
  ET_CHECK(check_quantized_mixed_linear_args(
      in, weight, weight_scales, opt_weight_zero_points, dtype, out));

  ScalarType out_dtype = dtype.has_value() ? dtype.value() : out.scalar_type();

  size_t output_ndim = 2;
  exec_aten::SizesType output_sizes[kTensorDimensionLimit];
  output_sizes[0] = in.size(0);
  output_sizes[1] = weight.size(0);

  ET_CHECK_MSG(
      resize_tensor(out, {output_sizes, output_ndim}) == Error::Ok,
      "Failed to resize out tensor.");

  constexpr auto name = "quantized_decomposed::mixed_linear.out";

  ET_SWITCH_TWO_TYPES(Float, Half, in.scalar_type(), ctx, name, CTYPE, [&]() {
    ET_SWITCH_FLOAT_TYPES_AND(Half, out_dtype, ctx, name, CTYPE_OUT, [&]() {
      size_t m = in.size(0);
      size_t n = in.size(1);
      size_t p = weight.size(0);
      // Default to per-channel scales: one group spanning the whole row.
      size_t g = n;

      if (weight_scales.dim() == 2) {
        // Group-wise scales: derive the group size from the number of
        // scale groups per output channel.
        g = (n + weight_scales.size(1) - 1) / weight_scales.size(1);
      }

      // FIXME: this currently ignores dtype
      vec_quantized_matmul_transb_int8<
          CTYPE_OUT, // T *z
          CTYPE>( // U *x, U *s
          out.mutable_data_ptr<CTYPE_OUT>(),
          in.const_data_ptr<CTYPE>(),
          weight.const_data_ptr<int8_t>(),
          weight_scales.const_data_ptr<CTYPE>(),
          m,
          n,
          p,
          g);
    });
  });

  return out;
}

Tensor& quantized_mixed_linear_out(
    RuntimeContext& ctx,
    const Tensor& in,
    const Tensor& weight,
    const Tensor& weight_scales,
    const optional<Tensor>& opt_weight_zero_points,
    const optional<ScalarType> dtype,
    Tensor& out) {
  // TODO(mcandales): Remove the need for this wrapper
  // TODO(mkg): add support for dtype
  (void)ctx;
  return quantized_mixed_linear_out(
      in, weight, weight_scales, opt_weight_zero_points, dtype, out);
}

} // namespace native
} // namespace executor
} // namespace torch
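To make the semantics concrete: in the per-channel case that the argument checks currently allow (rank-1 weight_scales, so g = n), the op is equivalent to scaling each int8 weight row by its single scale and applying an ordinary linear. Below is a standalone reference sketch of that computation; it is not part of the commit, and the helper name and row-major layout are illustrative assumptions.

#include <cstdint>
#include <vector>

// out[i][j] = sum_k in[i][k] * weight[j][k] * scales[j], matching
// vec_quantized_matmul_transb_int8 with g = n (one group per channel).
std::vector<float> mixed_linear_reference(
    const std::vector<float>& in, // m x n, row-major
    const std::vector<int8_t>& weight, // p x n, row-major
    const std::vector<float>& scales, // p
    int64_t m,
    int64_t n,
    int64_t p) {
  std::vector<float> out(m * p, 0.0f);
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < p; ++j) {
      float acc = 0.0f;
      for (int64_t k = 0; k < n; ++k) {
        acc += in[i * n + k] * static_cast<float>(weight[j * n + k]);
      }
      // One scale per output channel in the per-channel case.
      out[i * p + j] = acc * scales[j];
    }
  }
  return out;
}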

kernels/quantized/cpu/targets.bzl

Lines changed: 6 additions & 0 deletions

@@ -29,6 +29,12 @@ _QUANT_OPS = (
             "//executorch/kernels/portable/cpu:vec_ops",
         ],
     ),
+    op_target(
+        name = "op_mixed_linear",
+        deps = [
+            "//executorch/kernels/portable/cpu:vec_ops",
+        ],
+    ),
     op_target(
         name = "op_quantize",
         deps = [

kernels/quantized/quantized.yaml

Lines changed: 6 additions & 0 deletions

@@ -46,6 +46,12 @@
     - arg_meta: null
       kernel_name: torch::executor::quantized_mixed_mm_out

+- func: quantized_decomposed::mixed_linear.out(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::quantized_mixed_linear_out
+
 - func: quantized_decomposed::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
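For orientation, a direct call to the registered kernel from a test could look like the sketch below. It assumes ExecuTorch's TensorFactory test utility and the generated NativeFunctions.h header; neither is part of this commit, the header path is an assumption, and call_mixed_linear is a hypothetical name.

#include <executorch/kernels/quantized/NativeFunctions.h> // assumed generated header
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>

using namespace torch::executor;

void call_mixed_linear() {
  testing::TensorFactory<ScalarType::Float> tf;
  testing::TensorFactory<ScalarType::Char> tw;

  // 1 x 4 float input, 2 x 4 int8 weight, one scale per output channel.
  Tensor in = tf.make({1, 4}, {1.0, 2.0, 3.0, 4.0});
  Tensor weight = tw.make({2, 4}, {1, -1, 2, 2, 0, 1, -2, 1});
  Tensor scales = tf.make({2}, {0.5, 1.0});
  Tensor out = tf.zeros({1, 2});

  RuntimeContext ctx{};
  // Zero points are not supported yet, so pass an empty optional.
  native::quantized_mixed_linear_out(
      ctx,
      in,
      weight,
      scales,
      exec_aten::optional<Tensor>(),
      ScalarType::Float,
      out);
  // Expected: out = [[6.5, 0.0]]
}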
