[executorch] Add vectorized scalar path for single-element Tensor passed to optimized mul

swolchok · swolchok · commit 82ff5ec1a61d · 2024-08-20T14:04:58.000-07:00
When one operand of optimized mul is a single-element Tensor, we are currently doing slow broadcasting. After this diff, that case should get nice vectorization. Differential Revision: [D61560825](https://our.internmc.facebook.com/intern/diff/D61560825/) [ghstack-poisoned]
diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
@@ -106,6 +106,15 @@ template <
     typename CTYPE_OUT>
 struct MulInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
     : public ReportCanCastBug {};
+
+Scalar tensor_to_scalar(RuntimeContext& ctx, const Tensor& t) {  // Wraps the single element of `t` in a Scalar.
+  ET_DCHECK(t.numel() == 1);  // Precondition: exactly one element (presumably a debug-only check -- TODO confirm).
+  Scalar result;  // Assigned only inside the lambda below; otherwise returned as default-constructed.
+  ET_SWITCH_REALHB_TYPES(t.scalar_type(), ctx, "mul.out", CTYPE, [&]() {  // Dispatch on t's runtime dtype; CTYPE is the matching C++ type.
+    result = Scalar(*t.const_data_ptr<CTYPE>());  // Read the first element as CTYPE and wrap it in a Scalar.
+  });
+  return result;
+}
 } // namespace
 
 Tensor& opt_mul_out(
@@ -119,6 +128,35 @@ Tensor& opt_mul_out(
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
 
+  if (b.numel() == 1) {
+    if (a_type == b_type && a_type == out_type &&
+        a_type != ScalarType::Half) {
+      auto error = resize_tensor(out, a.sizes());
+      ET_KERNEL_CHECK_MSG(
+          ctx,
+          error == Error::Ok,
+          InvalidArgument,
+          out,
+          "Failed to resize output tensor.");
+      ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.out", CTYPE, [&]() {
+        ET_SWITCH_REALB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
+          CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
+          CTYPE b_casted = static_cast<CTYPE>(b_val);
+
+          using Vec = executorch::vec::Vectorized<CTYPE>;
+          executorch::vec::map<CTYPE>(
+              [b_casted](Vec x) { return x * Vec(b_casted); },
+              out.mutable_data_ptr<CTYPE>(),
+              a.const_data_ptr<CTYPE>(),
+              out.numel());
+        });
+      });
+      return out;
+    }
+  } else if (a.numel() == 1) {
+    return opt_mul_out(ctx, b, a, out);
+  }
+
   if (can_use_optimized_path(a, b, out)) {
     // Resize for dynamic shape
     auto error = resize_tensor(out, a.sizes());