Commit a14c5d1 (parent: a3c2b7a)

[ExecuTorch] Add optimized op_linear

If we happen to be running without a delegate, implementing linear directly is much more efficient than permute_copy_out (which materializes a transpose) followed by matmul.

Differential Revision: D62154007 (https://our.internmc.facebook.com/intern/diff/D62154007/)
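To make the claim concrete: linear computes out = in * weight^T for a row-major input of shape [n, k] and weight of shape [m, k]. Below is a minimal sketch of the two strategies the message contrasts; the function names and plain float buffers are illustrative, not code from this commit.

#include <cstddef>
#include <vector>

// Direct linear: out[i][j] = dot(row i of in, row j of weight).
// Reads weight in place; no intermediate buffer is needed.
void linear_direct(
    const float* in, const float* weight, float* out,
    std::size_t n, std::size_t k, std::size_t m) {
  for (std::size_t i = 0; i < n; ++i) {
    for (std::size_t j = 0; j < m; ++j) {
      float acc = 0.0f;
      for (std::size_t kk = 0; kk < k; ++kk) {
        acc += in[i * k + kk] * weight[j * k + kk];
      }
      out[i * m + j] = acc;
    }
  }
}

// The un-delegated fallback: first materialize weight^T (an extra k*m
// buffer plus a full copy, which is what permute_copy_out amounts to),
// then run a plain matmul against it.
void linear_via_transpose(
    const float* in, const float* weight, float* out,
    std::size_t n, std::size_t k, std::size_t m) {
  std::vector<float> weight_t(k * m);
  for (std::size_t j = 0; j < m; ++j) {
    for (std::size_t kk = 0; kk < k; ++kk) {
      weight_t[kk * m + j] = weight[j * k + kk]; // the copy being avoided
    }
  }
  for (std::size_t i = 0; i < n; ++i) {
    for (std::size_t j = 0; j < m; ++j) {
      float acc = 0.0f;
      for (std::size_t kk = 0; kk < k; ++kk) {
        acc += in[i * k + kk] * weight_t[kk * m + j];
      }
      out[i * m + j] = acc;
    }
  }
}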

File tree: 9 files changed (+434, -0 lines)

kernels/aten/functions.yaml (2 additions, 0 deletions)

@@ -215,6 +215,8 @@
 
 - op: linalg_vector_norm.out
 
+- op: linear.out
+
 - op: log.out
 
 - op: log10.out

kernels/optimized/cpu/op_linear.cpp (new file, 80 additions)

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/optimized/blas/CPUBlas.h>
#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

#include <array>

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;

Tensor& opt_linear_out(
    RuntimeContext& ctx,
    const Tensor& in,
    const Tensor& mat2,
    const optional<Tensor>& bias,
    Tensor& out) {
  ET_KERNEL_CHECK_MSG(
      ctx,
      !bias.has_value(),
      InvalidArgument,
      out,
      "bias not supported yet in linear");
  ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out);

  size_t output_ndim = 0;
  std::array<exec_aten::SizesType, kTensorDimensionLimit> output_sizes;
  get_linear_out_target_size(in, mat2, output_sizes.data(), &output_ndim);
  ET_KERNEL_CHECK(
      ctx,
      resize_tensor(out, {output_sizes.data(), output_ndim}) == Error::Ok,
      InvalidArgument,
      out);

  // gemm on some platforms doesn't tolerate empty input.
  if (out.numel() == 0) {
    return out;
  }

  int flattened_input_dim = 1;
  for (int ii = 0; ii < in.dim() - 1; ++ii) {
    flattened_input_dim *= in.sizes()[ii];
  }
  ET_SWITCH_REAL_TYPES_AND2(
      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
        size_t n = flattened_input_dim;
        size_t k = in.sizes()[in.dim() - 1];
        size_t m = mat2.size(0);

        executorch::cpublas::gemm(
            executorch::cpublas::TransposeType::Transpose,
            executorch::cpublas::TransposeType::NoTranspose,
            m,
            n,
            k,
            static_cast<CTYPE>(1),
            mat2.const_data_ptr<CTYPE>(),
            k,
            in.const_data_ptr<CTYPE>(),
            k,
            static_cast<CTYPE>(0),
            out.mutable_data_ptr<CTYPE>(),
            m);
      });

  return out;
}

} // namespace native
} // namespace executor
} // namespace torch
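Why this single gemm call implements linear: assuming executorch::cpublas::gemm follows the standard column-major BLAS convention (as its transpose-type and leading-dimension parameters suggest), a row-major [r, c] buffer read column-major with leading dimension c is exactly its c x r transpose, so the transpose is obtained by description rather than by copying. A sketch of the mapping (my reading, not comments from the commit):

// Column-major BLAS gemm computes C = op(A) * op(B), with C m-by-n.
//
//   A = mat2 (row-major [m, k]), lda = k, transa = Transpose
//     -> A reads column-major as mat2^T [k, m], so op(A) = mat2 (m x k)
//   B = in (row-major [n, k] after flattening batch dims), ldb = k,
//       transb = NoTranspose -> op(B) = in^T (k x n)
//   C = out, ldc = m -> C = mat2 * in^T (m x n, column-major)
//
// A column-major [m, n] result occupies the same buffer as a row-major
// [n, m] one, so out = (mat2 * in^T)^T = in * mat2^T: exactly linear,
// with no transpose ever materialized.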

kernels/optimized/cpu/targets.bzl (7 additions, 0 deletions)

@@ -40,6 +40,13 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu:scalar_utils",
         ],
     ),
+    op_target(
+        name = "op_linear",
+        deps = [
+            "//executorch/kernels/optimized:libblas",
+            "//executorch/kernels/portable/cpu/util:matmul_ops_util",
+        ],
+    ),
     op_target(
         name = "op_log_softmax",
         deps = select({

kernels/optimized/optimized-oss.yaml (5 additions, 0 deletions)

@@ -45,6 +45,11 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+  - arg_meta: null
+    kernel_name: torch::executor::opt_linear_out
+
 - op: mul.out
   kernels:
   - arg_meta: null

kernels/optimized/optimized.yaml (5 additions, 0 deletions)

@@ -52,6 +52,11 @@
   - arg_meta: null
     kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+  - arg_meta: null
+    kernel_name: torch::executor::opt_linear_out
+
 - op: mm.out
   kernels:
   - arg_meta: null

kernels/portable/cpu/util/matmul_ops_util.cpp (25 additions, 0 deletions)

@@ -71,6 +71,19 @@ bool check_mm_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
   return true;
 }
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out) {
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() == out.dim());
+  ET_LOG_AND_RETURN_IF_FALSE(in.dim() >= 2);
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_is_rank(mat2, 2));
+
+  ET_LOG_AND_RETURN_IF_FALSE(tensors_have_same_dtype(in, mat2, out));
+
+  ET_LOG_AND_RETURN_IF_FALSE(
+      tensors_have_same_size_at_dims(in, in.dim() - 1, mat2, 1));
+
+  return true;
+}
+
 void get_mm_out_target_size(
     const Tensor& mat1,
     const Tensor& mat2,
@@ -81,5 +94,17 @@ void get_mm_out_target_size(
   out_sizes[1] = mat2.size(1);
 }
 
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim) {
+  *out_ndim = mat1.dim();
+  for (int ii = 0; ii < mat1.dim() - 1; ++ii) {
+    out_sizes[ii] = mat1.sizes()[ii];
+  }
+  out_sizes[mat1.dim() - 1] = mat2.size(0);
+}
+
 } // namespace executor
 } // namespace torch
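As a worked example of the new helpers, take hypothetical shapes in = [2, 3, 5] and mat2 = [4, 5]: check_linear_args passes (in.dim() >= 2, mat2 has rank 2, and in's last dimension 5 matches mat2.size(1)), and get_linear_out_target_size keeps every leading dimension of in while replacing the last with mat2.size(0). A standalone restatement of that size logic with plain arrays in place of the ExecuTorch tensor types (illustrative sketch only):

#include <cstddef>
#include <cstdio>

// Mirrors get_linear_out_target_size: copy every leading dim of mat1,
// then make the last output dim mat2's row count.
void linear_out_sizes(
    const int* mat1_sizes,
    std::size_t mat1_dim,
    int mat2_rows,
    int* out_sizes,
    std::size_t* out_ndim) {
  *out_ndim = mat1_dim;
  for (std::size_t i = 0; i + 1 < mat1_dim; ++i) {
    out_sizes[i] = mat1_sizes[i];
  }
  out_sizes[mat1_dim - 1] = mat2_rows;
}

int main() {
  const int in_sizes[] = {2, 3, 5}; // e.g. [batch, rows, k]
  int out_sizes[3];
  std::size_t ndim = 0;
  linear_out_sizes(in_sizes, 3, /*mat2_rows=*/4, out_sizes, &ndim);
  // Prints: out sizes = [2, 3, 4]
  std::printf(
      "out sizes = [%d, %d, %d]\n", out_sizes[0], out_sizes[1], out_sizes[2]);
  return 0;
}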

kernels/portable/cpu/util/matmul_ops_util.h (8 additions, 0 deletions)

@@ -37,5 +37,13 @@ void get_mm_out_target_size(
     Tensor::SizesType* out_sizes,
     size_t* out_ndim);
 
+bool check_linear_args(const Tensor& in, const Tensor& mat2, Tensor& out);
+
+void get_linear_out_target_size(
+    const Tensor& mat1,
+    const Tensor& mat2,
+    Tensor::SizesType* out_sizes,
+    size_t* out_ndim);
+
 } // namespace executor
 } // namespace torch
