Skip to content

Commit 858e9fd

Browse files
authored
[executorch] Propagate mul optimizations from D61504544/D61560825/D61560826 to add/sub/div
Differential Revision: D61577411 Pull Request resolved: #4816
1 parent 8e3361e commit 858e9fd

File tree

9 files changed

+465
-101
lines changed

9 files changed

+465
-101
lines changed

kernels/optimized/cpu/binary_ops.h

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
namespace torch {
14+
namespace executor {
15+
namespace internal {
16+
// NOTE: we bake ArrayRef iterators being pointers into the return
17+
// type here because we assume that iterators are portable across
18+
// ArrayRef copies.
19+
// Returns a pointer to the first size entry that is not 1, i.e. skips
// any leading broadcast-trivial dimensions. If every entry is 1 (or the
// array is empty), returns arr.end().
inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
    ArrayRef<Tensor::SizesType> arr) {
  const Tensor::SizesType* pos = arr.begin();
  while (pos != arr.end() && *pos == 1) {
    ++pos;
  }
  return pos;
}
24+
25+
// Returns true when the two size lists are identical once any leading
// 1-sized dimensions are stripped from each (e.g. [1, 3, 4] vs [3, 4]).
inline bool sizes_match_ignoring_leading_1s(
    ArrayRef<Tensor::SizesType> lhs,
    ArrayRef<Tensor::SizesType> rhs) {
  const auto* lhs_it = arrayref_begin_ignoring_leading_1s(lhs);
  const auto* rhs_it = arrayref_begin_ignoring_leading_1s(rhs);

  // Trailing ranks must agree before comparing element-wise.
  if ((lhs.end() - lhs_it) != (rhs.end() - rhs_it)) {
    return false;
  }
  return std::equal(lhs_it, lhs.end(), rhs_it);
}
37+
} // namespace internal
38+
39+
// Which vectorized fast path (if any) a binary elementwise op may take.
enum class ElementwiseOptimizedPath {
  // No optimized path applies; caller falls back to the generic path.
  kNone,
  // Operands are shape-compatible enough to process as one flat 1-D pass.
  kTreatAs1d,
  // First operand is (ignoring leading 1s) 2-D and the second is 1-D with
  // a length matching the first's last dimension; broadcast row-wise.
  kBroadcast2dBy1d,
  // Same broadcast as kBroadcast2dBy1d, but the 1-D tensor is the first
  // argument, so the operands must be swapped before dispatch.
  kBroadcast2dBy1dReverseArguments,
};
45+
46+
namespace internal {
47+
inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
48+
const Tensor& lhs,
49+
const Tensor& rhs) {
50+
auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
51+
auto lhs_end = lhs.sizes().end();
52+
53+
auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes());
54+
auto rhs_end = rhs.sizes().end();
55+
56+
const auto lhs_size = lhs_end - lhs_begin;
57+
const auto rhs_size = rhs_end - rhs_begin;
58+
if (lhs_size == 2 && rhs_size == 1 && lhs_begin[1] == rhs_begin[0]) {
59+
return ElementwiseOptimizedPath::kBroadcast2dBy1d;
60+
}
61+
62+
if (lhs_size == 1 && rhs_size == 2 && rhs_begin[1] == lhs_begin[0]) {
63+
return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments;
64+
}
65+
66+
return ElementwiseOptimizedPath::kNone;
67+
}
68+
} // namespace internal
69+
70+
// Picks the fastest applicable elementwise path for (a op b) -> out.
// Requires all three dtypes to match and to not be Half; otherwise the
// generic (non-optimized) path is used.
inline ElementwiseOptimizedPath select_optimized_path(
    const Tensor& a,
    const Tensor& b,
    const Tensor& out) {
  const ScalarType a_type = a.scalar_type();
  const ScalarType b_type = b.scalar_type();
  const ScalarType out_type = out.scalar_type();

  const bool dtypes_allow_optimization =
      a_type == b_type && a_type == out_type && a_type != ScalarType::Half;
  if (!dtypes_allow_optimization) {
    return ElementwiseOptimizedPath::kNone;
  }

  // Identical shapes — or equal element counts with shapes that differ
  // only by leading 1s (or an out that matches element-wise) — can be
  // handled as a single contiguous 1-D traversal.
  const bool same_shape = a.sizes().equals(b.sizes());
  const bool flat_compatible = a.numel() == b.numel() &&
      (a.numel() == out.numel() ||
       internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes()));
  if (same_shape || flat_compatible) {
    return ElementwiseOptimizedPath::kTreatAs1d;
  }

  // Otherwise, try the 2-D-by-1-D broadcasting fast path.
  return internal::select_broadcast_2d_by_1d_optimized_path(a, b);
}
89+
90+
} // namespace executor
91+
} // namespace torch

kernels/optimized/cpu/op_add.cpp

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/kernels/optimized/cpu/binary_ops.h>
910
#include <executorch/kernels/optimized/vec/functional.h>
1011
#include <executorch/kernels/optimized/vec/vec.h>
1112
#include <executorch/kernels/portable/cpu/scalar_utils.h>
@@ -81,8 +82,41 @@ Tensor& opt_add_out(
8182
ScalarType b_type = b.scalar_type();
8283
ScalarType out_type = out.scalar_type();
8384

84-
if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes()) &&
85-
a_type != ScalarType::Half) {
85+
if (b.numel() == 1) {
86+
if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
87+
auto error = resize_tensor(out, a.sizes());
88+
ET_KERNEL_CHECK_MSG(
89+
ctx,
90+
error == Error::Ok,
91+
InvalidArgument,
92+
out,
93+
"Failed to resize output tensor.");
94+
ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() {
95+
ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
96+
CTYPE alpha_val;
97+
ET_KERNEL_CHECK(
98+
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
99+
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
100+
CTYPE b_casted = static_cast<CTYPE>(b_val);
101+
102+
using Vec = executorch::vec::Vectorized<CTYPE>;
103+
executorch::vec::map<CTYPE>(
104+
[alpha_val, b_casted](Vec x) {
105+
return x + Vec(alpha_val * b_casted);
106+
},
107+
out.mutable_data_ptr<CTYPE>(),
108+
a.const_data_ptr<CTYPE>(),
109+
out.numel());
110+
});
111+
});
112+
return out;
113+
}
114+
} else if (a.numel() == 1) {
115+
return opt_add_out(ctx, b, a, alpha, out);
116+
}
117+
118+
auto selected_optimized_path = select_optimized_path(a, b, out);
119+
if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
86120
// Resize for dynamic shape
87121
auto error = resize_tensor(out, a.sizes());
88122
ET_KERNEL_CHECK_MSG(
@@ -105,6 +139,42 @@ Tensor& opt_add_out(
105139
b.const_data_ptr<CTYPE>(),
106140
out.numel());
107141
});
142+
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
143+
const Tensor* lhs;
144+
const Tensor* rhs;
145+
if (selected_optimized_path ==
146+
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
147+
lhs = &b;
148+
rhs = &a;
149+
} else {
150+
// Catch failure to update logic when adding new broadcasting possibility.
151+
ET_DCHECK(
152+
selected_optimized_path ==
153+
ElementwiseOptimizedPath::kBroadcast2dBy1d);
154+
lhs = &a;
155+
rhs = &b;
156+
}
157+
auto error = resize_tensor(out, lhs->sizes());
158+
ET_KERNEL_CHECK_MSG(
159+
ctx,
160+
error == Error::Ok,
161+
InvalidArgument,
162+
out,
163+
"Failed to resize output tensor.");
164+
ET_SWITCH_REALB_TYPES(out_type, ctx, "add.out", CTYPE, [&]() {
165+
CTYPE alpha_val;
166+
ET_KERNEL_CHECK(
167+
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
168+
169+
using Vec = executorch::vec::Vectorized<CTYPE>;
170+
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
171+
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
172+
out.mutable_data_ptr<CTYPE>(),
173+
lhs->const_data_ptr<CTYPE>(),
174+
rhs->const_data_ptr<CTYPE>(),
175+
lhs->sizes()[lhs->dim() - 2],
176+
lhs->sizes()[lhs->dim() - 1]);
177+
});
108178
} else {
109179
ScalarType common_type =
110180
promoteTypes(a_type, b_type, /*half_to_float*/ true);

kernels/optimized/cpu/op_div.cpp

Lines changed: 112 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/kernels/optimized/cpu/binary_ops.h>
910
#include <executorch/kernels/optimized/vec/functional.h>
1011
#include <executorch/kernels/optimized/vec/vec.h>
1112
#include <executorch/kernels/portable/cpu/scalar_utils.h>
@@ -48,7 +49,57 @@ Tensor& opt_div_out(
4849
ScalarType b_type = b.scalar_type();
4950
ScalarType out_type = out.scalar_type();
5051

51-
if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes())) {
52+
if (a.numel() == 1 || b.numel() == 1) {
53+
if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
54+
const Tensor* tensor;
55+
const Tensor* scalar;
56+
ScalarType tensor_type;
57+
ScalarType scalar_type;
58+
if (a.numel() == 1) {
59+
tensor = &b;
60+
tensor_type = b_type;
61+
scalar = &a;
62+
scalar_type = a_type;
63+
} else {
64+
tensor = &a;
65+
tensor_type = a_type;
66+
scalar = &b;
67+
scalar_type = b_type;
68+
}
69+
auto error = resize_tensor(out, tensor->sizes());
70+
ET_KERNEL_CHECK_MSG(
71+
ctx,
72+
error == Error::Ok,
73+
InvalidArgument,
74+
out,
75+
"Failed to resize output tensor.");
76+
ET_SWITCH_REALB_TYPES(tensor_type, ctx, "div.out", CTYPE, [&]() {
77+
ET_SWITCH_REALB_TYPES(scalar_type, ctx, "div.out", CTYPE_SCALAR, [&]() {
78+
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
79+
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);
80+
81+
using Vec = executorch::vec::Vectorized<CTYPE>;
82+
if (a.numel() == 1) {
83+
executorch::vec::map<CTYPE>(
84+
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
85+
out.mutable_data_ptr<CTYPE>(),
86+
tensor->const_data_ptr<CTYPE>(),
87+
out.numel());
88+
} else {
89+
executorch::vec::map<CTYPE>(
90+
[scalar_casted](Vec x) { return x / Vec(scalar_casted); },
91+
out.mutable_data_ptr<CTYPE>(),
92+
tensor->const_data_ptr<CTYPE>(),
93+
out.numel());
94+
}
95+
});
96+
});
97+
return out;
98+
}
99+
}
100+
101+
auto selected_optimized_path = select_optimized_path(a, b, out);
102+
if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
52103
// Resize for dynamic shape
53104
auto error = resize_tensor(out, a.sizes());
54105
ET_KERNEL_CHECK_MSG(
@@ -67,6 +118,49 @@ Tensor& opt_div_out(
67118
b.const_data_ptr<CTYPE>(),
68119
out.numel());
69120
});
121+
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
122+
const Tensor* lhs;
123+
const Tensor* rhs;
124+
if (selected_optimized_path ==
125+
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
126+
lhs = &b;
127+
rhs = &a;
128+
} else {
129+
// Catch failure to update logic when subing new broadcasting possibility.
130+
ET_DCHECK(
131+
selected_optimized_path ==
132+
ElementwiseOptimizedPath::kBroadcast2dBy1d);
133+
lhs = &a;
134+
rhs = &b;
135+
}
136+
auto error = resize_tensor(out, lhs->sizes());
137+
ET_KERNEL_CHECK_MSG(
138+
ctx,
139+
error == Error::Ok,
140+
InvalidArgument,
141+
out,
142+
"Failed to resize output tensor.");
143+
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
144+
using Vec = executorch::vec::Vectorized<CTYPE>;
145+
if (selected_optimized_path ==
146+
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
147+
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
148+
[](Vec x, Vec y) { return y / x; },
149+
out.mutable_data_ptr<CTYPE>(),
150+
lhs->const_data_ptr<CTYPE>(),
151+
rhs->const_data_ptr<CTYPE>(),
152+
lhs->sizes()[lhs->dim() - 2],
153+
lhs->sizes()[lhs->dim() - 1]);
154+
} else {
155+
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
156+
[](Vec x, Vec y) { return x / y; },
157+
out.mutable_data_ptr<CTYPE>(),
158+
lhs->const_data_ptr<CTYPE>(),
159+
rhs->const_data_ptr<CTYPE>(),
160+
lhs->sizes()[lhs->dim() - 2],
161+
lhs->sizes()[lhs->dim() - 1]);
162+
}
163+
});
70164
} else {
71165
ScalarType common_type = get_compute_type(a_type, b_type);
72166
ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
@@ -77,25 +171,23 @@ Tensor& opt_div_out(
77171
InvalidArgument,
78172
out);
79173

80-
ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
81-
ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
82-
ET_SWITCH_REAL_TYPES_AND(
83-
Bool, common_type, ctx, "div.out", CTYPE_IN, [&]() {
84-
ET_SWITCH_REAL_TYPES_AND(
85-
Bool, out_type, ctx, "div.out", CTYPE_OUT, [&]() {
86-
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
87-
[](const CTYPE_A val_a, const CTYPE_B val_b) {
88-
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
89-
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
90-
CTYPE_IN value = a_casted / b_casted;
91-
92-
return static_cast<CTYPE_OUT>(value);
93-
},
94-
a,
95-
b,
96-
out);
97-
});
98-
});
174+
ET_SWITCH_REALB_TYPES(a_type, ctx, "div.out", CTYPE_A, [&]() {
175+
ET_SWITCH_REALB_TYPES(b_type, ctx, "div.out", CTYPE_B, [&]() {
176+
ET_SWITCH_REALB_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
177+
ET_SWITCH_REALB_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
178+
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
179+
[](const CTYPE_A val_a, const CTYPE_B val_b) {
180+
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
181+
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
182+
CTYPE_IN value = a_casted / b_casted;
183+
184+
return static_cast<CTYPE_OUT>(value);
185+
},
186+
a,
187+
b,
188+
out);
189+
});
190+
});
99191
});
100192
});
101193
}

0 commit comments

Comments
 (0)