[executorch] Add vectorized scalar path for single-element Tensor passed to optimized mul

swolchok · swolchok · commit 452993b3b4ae · 2024-08-20T14:24:47.000-07:00
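Pull Request resolved: #4807. When one operand of optimized mul is a single-element Tensor, we currently fall back to slow broadcasting. After this diff, that case is handled by a vectorized tensor-times-scalar path (taken when the input and output dtypes all match and are not Half). ghstack-source-id: 239001637 @exported-using-ghexport Differential Revision: [D61560825](https://our.internmc.facebook.com/intern/diff/D61560825/)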
diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
@@ -119,6 +119,34 @@ Tensor& opt_mul_out(
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
 
+  if (b.numel() == 1) {
+    if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
+      auto error = resize_tensor(out, a.sizes());
+      ET_KERNEL_CHECK_MSG(
+          ctx,
+          error == Error::Ok,
+          InvalidArgument,
+          out,
+          "Failed to resize output tensor.");
+      ET_SWITCH_REALB_TYPES(a_type, ctx, "mul.out", CTYPE, [&]() {
+        ET_SWITCH_REALB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
+          CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
+          CTYPE b_casted = static_cast<CTYPE>(b_val);
+
+          using Vec = executorch::vec::Vectorized<CTYPE>;
+          executorch::vec::map<CTYPE>(
+              [b_casted](Vec x) { return x * Vec(b_casted); },
+              out.mutable_data_ptr<CTYPE>(),
+              a.const_data_ptr<CTYPE>(),
+              out.numel());
+        });
+      });
+      return out;
+    }
+  } else if (a.numel() == 1) {
+    return opt_mul_out(ctx, b, a, out);
+  }
+
   if (can_use_optimized_path(a, b, out)) {
     // Resize for dynamic shape
     auto error = resize_tensor(out, a.sizes());