Update on "[executorch] Optimized 2D-by-1D broadcasting in optimized op_mul"

swolchok · swolchok · commit f1730630dc29 · 2024-08-20T14:24:48.000-07:00
Detect when an elementwise multiplication has a 2D tensor and a 1D tensor as its operands, and dispatch that case to a vectorized kernel. Differential Revision: [D61560826](https://our.internmc.facebook.com/intern/diff/D61560826/) [ghstack-poisoned]
diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
@@ -135,15 +135,6 @@ template <
     typename CTYPE_OUT>
 struct MulInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
     : public ReportCanCastBug {};
-
-Scalar tensor_to_scalar(RuntimeContext& ctx, const Tensor& t) {
-  ET_DCHECK(t.numel() == 1);
-  Scalar result;
-  ET_SWITCH_REALHB_TYPES(t.scalar_type(), ctx, "mul.out", CTYPE, [&]() {
-    result = Scalar(*t.const_data_ptr<CTYPE>());
-  });
-  return result;
-}
 } // namespace
 
 Tensor& opt_mul_out(