
Commit fb0a6e1

mcremon-meta authored and facebook-github-bot committed
Add scalar cases for add and mul on HiFi
Summary: As titled. Currently these cases fall through to the unoptimized broadcast call, which is extremely inefficient. A simple loop does much better, and can be further optimized later if needed.

Example of gains: the mul op goes from 40M to 123k on the 27M ASR encoder.

Differential Revision: D71495734
1 parent e0235f0 commit fb0a6e1
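To make the idea in the summary concrete, here is a minimal standalone sketch of the scalar fast path. This is an illustration only, not the ExecuTorch kernel API: the plain std::vector<float> buffers and the scalar_mul_sketch helper are hypothetical stand-ins for the Tensor-based code in the diffs below.

#include <cstdio>
#include <vector>

// Sketch of the "a is a 0-dim scalar" case: read the single value once and
// apply it across b with a flat loop, instead of going through the generic
// broadcast machinery.
static void scalar_mul_sketch(
    const std::vector<float>& a, // holds exactly one element
    const std::vector<float>& b,
    std::vector<float>& out) {
  const float scalar = a[0];
  for (size_t i = 0; i < b.size(); i++)
    out[i] = scalar * b[i];
}

int main() {
  std::vector<float> a = {2.0f};             // scalar operand
  std::vector<float> b = {1.0f, 2.0f, 3.0f}; // tensor operand
  std::vector<float> out(b.size());
  scalar_mul_sketch(a, b, out);
  for (float v : out)
    std::printf("%f\n", v); // prints 2.0, 4.0, 6.0
  return 0;
}

In op_add.cpp and op_mul.cpp below, the same loop is written directly against the Tensor data pointers and returns early, so only non-scalar shapes continue on to the existing broadcast and kNnlibMaxDim checks.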


2 files changed: +25 additions, −5 deletions


backends/cadence/hifi/operators/op_add.cpp

Lines changed: 12 additions & 2 deletions
@@ -138,8 +138,18 @@ Tensor& add_out(
   if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
     optimized = 0;
 
-  if ((a_dim == 0) || (b_dim == 0))
-    optimized = 0;
+  bool float_types = (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] = a.const_data_ptr<float>()[0] + b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] = a.const_data_ptr<float>()[i] + b.const_data_ptr<float>()[0];
+    return out;
+  }
 
   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
     optimized = 0;

backends/cadence/hifi/operators/op_mul.cpp

Lines changed: 13 additions & 3 deletions
@@ -104,10 +104,20 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
   int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
   max_dim = out.dim() > max_dim ? out.dim() : max_dim;
 
-  if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
-    optimized = 0;
+  bool float_types = (a_type == ScalarType::Float) && (b_type == ScalarType::Float);
+
+  if ((a_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] = a.const_data_ptr<float>()[0] * b.const_data_ptr<float>()[i];
+    return out;
+  }
+  if ((b_dim == 0) && float_types) {
+    for (int i = 0; i < max_dim; i++)
+      out.mutable_data_ptr<float>()[i] = a.const_data_ptr<float>()[i] * b.const_data_ptr<float>()[0];
+    return out;
+  }
 
-  if ((a_dim == 0) || (b_dim == 0))
+  if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float))
     optimized = 0;
 
   if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
