Commit c275c64

Revert "Add vectorization in elementwise_util (#9432)"
This reverts commit 4c35fe0.
1 parent a2e898e commit c275c64

24 files changed: 41 additions, 370 deletions

.lintrunner.toml

Lines changed: 0 additions & 2 deletions
@@ -272,8 +272,6 @@ exclude_patterns = [
     'exir/verification/bindings.cpp',
     'extension/**',
     # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
-    'kernels/portable/cpu/util/math_util.h',
     'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',

kernels/portable/cpu/op_add.cpp

Lines changed: 4 additions & 8 deletions
@@ -102,18 +102,14 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [val_alpha_times_b](const auto val_a) {
-          // Cast here supports vectorization; either it does nothing
-          // or it casts from CTYPE_COMPUTE to
-          // Vectorized<CTYPE_COMPUTE>.
-          return val_a + decltype(val_a)(val_alpha_times_b);
+        [b, alpha](const auto val_a) {
+          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+          return val_a + val_alpha * val_b;
         },
         ctx,
         a,

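The comment deleted above describes the trick that made the old lambda vectorization-capable: decltype(val_a)(val_alpha_times_b) is a no-op cast when val_a is a plain CTYPE_COMPUTE, and a scalar-to-all-lanes broadcast when val_a is at::vec::Vectorized<CTYPE_COMPUTE>. A minimal standalone sketch of the same pattern, with FakeVec as a stand-in for the ATen vector type:

#include <array>
#include <iostream>

// Stand-in for at::vec::Vectorized<float>: constructible from a scalar
// (broadcasting it to every lane) and supporting elementwise operator+.
struct FakeVec {
  std::array<float, 4> lanes{};
  FakeVec() = default;
  explicit FakeVec(float v) { lanes.fill(v); }
  FakeVec operator+(const FakeVec& o) const {
    FakeVec r;
    for (int i = 0; i < 4; ++i) {
      r.lanes[i] = lanes[i] + o.lanes[i];
    }
    return r;
  }
};

int main() {
  const float val_alpha_times_b = 3.0f;
  // Generic lambda in the style of the reverted op_add code: the same
  // body instantiates for both the scalar and the vector type.
  auto add_fn = [val_alpha_times_b](const auto val_a) {
    return val_a + decltype(val_a)(val_alpha_times_b);
  };
  std::cout << add_fn(1.0f) << '\n';                   // scalar path: 4
  std::cout << add_fn(FakeVec(1.0f)).lanes[0] << '\n'; // vector path: 4 per lane
}
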
kernels/portable/cpu/op_atan2.cpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ Tensor& atan2_out(
         op_name,
         utils::SupportedTensorDtypes::FLOATHBF16>(
         [](const auto val_a, const auto val_b) {
-          return executorch::math::atan2(val_a, val_b);
+          return std::atan2(val_a, val_b);
         },
         ctx,
         a,

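For context: executorch::math::atan2 comes from vectorized_math.h, whose include this commit also removes. Wrappers of that sort exist because std::atan2 has no overload accepting SIMD vector arguments, so switching back to std::atan2 is what pins this lambda to scalars. A hedged sketch of the overload-set shape such a wrapper plausibly has, with MyVec as a hypothetical stand-in for the vector type:

#include <cmath>

// Hypothetical 4-lane vector type standing in for at::vec::Vectorized<float>.
struct MyVec {
  float lanes[4];
};

namespace mymath {
// Scalar overload: defer to the standard library.
inline float atan2(float a, float b) {
  return std::atan2(a, b);
}
// Vector overload: apply the scalar routine lane by lane. (A real SIMD
// implementation would use a vectorized atan2 instead of this loop.)
inline MyVec atan2(const MyVec& a, const MyVec& b) {
  MyVec r;
  for (int i = 0; i < 4; ++i) {
    r.lanes[i] = std::atan2(a.lanes[i], b.lanes[i]);
  }
  return r;
}
} // namespace mymath

With both overloads present, a generic lambda calling mymath::atan2(val_a, val_b) compiles for scalars and vectors alike; with only std::atan2, it compiles for scalars only.
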
kernels/portable/cpu/op_clamp.cpp

Lines changed: 3 additions & 2 deletions
@@ -138,8 +138,9 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const auto val_in) {
-          auto val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));

kernels/portable/cpu/op_elu.cpp

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ Tensor& elu_out(
         CTYPE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [negcoef, math_scale, math_input_scale](const CTYPE x) {
+        [negcoef, math_scale, math_input_scale](const auto x) {
+          // TODO: rewrite this to be vectorization-capable.
           return MathT(x) <= MathT(0)
               ? std::expm1(MathT(x) * math_input_scale) * negcoef
               : MathT(x) * math_scale;

kernels/portable/cpu/op_fmod.cpp

Lines changed: 5 additions & 3 deletions
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
         utils::SupportedTensorDtypes::REALHBF16>(
         [&div_by_zero_error](
             const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          // TODO: rewrite this to be vectorization-capable?
+          // TODO: rewrite this to be vectorization-capable.
           CTYPE_COMPUTE value = 0;
           if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
             if (val_b == 0) {
@@ -138,8 +138,10 @@ Tensor& fmod_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [val_b](const auto val_a) {
-          return executorch::math::fmod(val_a, (decltype(val_a))val_b);
+        [val_b](const CTYPE_COMPUTE val_a) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE value = std::fmod(val_a, val_b);
+          return value;
         },
         ctx,
         a,

kernels/portable/cpu/op_maximum.cpp

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ Tensor& maximum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
           return utils::max_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_minimum.cpp

Lines changed: 2 additions & 1 deletion
@@ -49,7 +49,8 @@ Tensor& minimum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          // TODO: rewrite this to be vectorization-capable.
           return utils::min_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_mul.cpp

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ Tensor& mul_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) { return val_a * val_b; },
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          return val_a * val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_native_dropout.cpp

Lines changed: 4 additions & 6 deletions
@@ -57,11 +57,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
   }
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-        utils::apply_bitensor_elementwise_fn<
-            CTYPE_COMPUTE,
-            op_name,
-            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-            [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
+        utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const auto val, const auto mask_val) {
               if (!mask_val) {
                 return static_cast<decltype(val)>(0);
               }
@@ -73,7 +70,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
             mask,
             // TODO: should really be just BOOL
             utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-            out);
+            out,
+            utils::SupportedTensorDtypes::SAME_AS_COMMON);
       });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());

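Both call shapes of apply_bitensor_elementwise_fn appear in this hunk: the reverted code passed the output-dtype restriction (SAME_AS_COMMON) as a third template argument, while the restored code passes it as a trailing runtime argument. A toy illustration of that tradeoff (not ExecuTorch's actual signatures): a template parameter is baked in at compile time, so the implementation can specialize per dtype set, at the cost of one instantiation per value; a runtime argument keeps a single instantiation.

#include <cstdio>

enum class SupportedTensorDtypes { SAME_AS_COMMON, BOOL_OR_BYTE };

// Compile-time shape: the dtype restriction is a template parameter, as
// in the reverted call. Each distinct value is a separate instantiation.
template <SupportedTensorDtypes out_dtypes>
void apply_fn_static() {
  std::printf("static: %d\n", static_cast<int>(out_dtypes));
}

// Runtime shape: the dtype restriction is a trailing argument, as in the
// restored call. One function body serves every value.
void apply_fn_dynamic(SupportedTensorDtypes out_dtypes) {
  std::printf("dynamic: %d\n", static_cast<int>(out_dtypes));
}

int main() {
  apply_fn_static<SupportedTensorDtypes::SAME_AS_COMMON>();
  apply_fn_dynamic(SupportedTensorDtypes::SAME_AS_COMMON);
}
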
kernels/portable/cpu/op_pow.cpp

Lines changed: 7 additions & 16 deletions
@@ -57,8 +57,9 @@ Tensor& pow_Tensor_Tensor_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [](const auto val_a, const auto val_b) {
-          return executorch::math::pow(val_a, val_b);
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          // TODO: rewrite this to be vectorization-capable.
+          return std::pow(val_a, val_b);
         },
         ctx,
         a,
@@ -110,13 +111,8 @@ Tensor& pow_Tensor_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // Casting val_b here supports vectorization; it does
-        // nothing if we are not vectorizing (casts to
-        // CTYPE_COMPUTE) and casts to a vectorized type
-        // otherwise.
-        [val_b](const auto val_a) {
-          return executorch::math::pow(val_a, decltype(val_a)(val_b));
-        },
+        // TODO: rewrite this to be vectorization-capable.
+        [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
@@ -165,13 +161,8 @@ Tensor& pow_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // Casting val_a here supports vectorization; it does
-        // nothing if we are not vectorizing (casts to
-        // CTYPE_COMPUTE) and casts to a vectorized type
-        // otherwise.
-        [val_a](const auto val_b) {
-          return executorch::math::pow(decltype(val_b)(val_a), val_b);
-        },
+        // TODO: rewrite this to be vectorization-capable.
+        [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
         ctx,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_sigmoid.cpp

Lines changed: 4 additions & 3 deletions
@@ -49,9 +49,10 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::FLOATHBF16>(
-        [](const auto val_in) {
-          const auto one = static_cast<decltype(val_in)>(1.0);
-          auto out_val = one / (one + executorch::math::exp(-val_in));
+        [](const auto val_in) -> CTYPE_COMPUTE {
+          // TODO: rewrite this to be vectorization-capable
+          CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
+              (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
           return out_val;
         },
         ctx,

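One detail worth noting in the restored lambda is the trailing return type. Without -> CTYPE_COMPUTE, the deduced return type would follow the arithmetic, which can promote (for example through double literals or a double-returning exp overload); pinning it keeps the lambda returning CTYPE_COMPUTE. A small self-contained illustration of the deduction difference:

#include <cmath>
#include <type_traits>

int main() {
  // Deduced return type follows the arithmetic: the double literal 1.0
  // promotes the whole expression to double.
  auto sigmoid_deduced = [](const auto v) {
    return 1.0 / (1.0 + std::exp(-v));
  };
  // A trailing return type pins the result, as the restored op_sigmoid
  // lambda does with -> CTYPE_COMPUTE.
  auto sigmoid_pinned = [](const auto v) -> float {
    return 1.0 / (1.0 + std::exp(-v));
  };
  static_assert(std::is_same_v<decltype(sigmoid_deduced(0.5f)), double>);
  static_assert(std::is_same_v<decltype(sigmoid_pinned(0.5f)), float>);
  return 0;
}
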
kernels/portable/cpu/op_where.cpp

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@ Tensor& where_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [](const CTYPE_COMPUTE val_a,
-           const CTYPE_COMPUTE val_b,
-           const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
+        [](const auto val_a, const auto val_b, const auto val_c) {
+          return val_c ? val_a : val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/util/elementwise_util.h

Lines changed: 1 addition & 138 deletions
@@ -12,14 +12,9 @@
 #include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
-#include <executorch/kernels/portable/cpu/util/vectorized_math.h> // Make vectorization support easy for clients.
 #include <executorch/runtime/kernel/kernel_runtime_context.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>
 
-#ifdef ET_USE_PYTORCH_HEADERS
-#include <ATen/cpu/vec/vec.h>
-#endif // ET_USE_PYTORCH_HEADERS
-
 #include <array>
 #include <utility>
 
@@ -56,38 +51,6 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 }
 
 namespace internal {
-template <typename Ignore, typename T>
-using ignore_first_yield_second = T;
-
-#ifdef ET_USE_PYTORCH_HEADERS
-// Can I call a function of type Op with sizeof...(Args) arguments of type
-// at::vec::Vectorized<CTYPE_COMPUTE>?
-//
-// See [NOTE: Generic lambdas] below for requirements on Op.
-template <typename CTYPE_COMPUTE, typename Op, typename... Args>
-constexpr bool can_use_vectorized() {
-  using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
-  // NOTE: if we start building optimized kernels on platforms that
-  // ATen Vectorized doesn't support well, we will want to add a way
-  // to check that Vectorized actually does something on our target
-  // platform. For now, I see no concrete need for that.
-  if constexpr (std::is_invocable_v<
-                    Op,
-                    ignore_first_yield_second<Args, Vec>...>) {
-    // For bool, we will get a false positive if we rely on only the
-    // is_invocable_v check above because at::vec::Vectorized is
-    // implicitly convertible to a pointer, which makes it implicitly
-    // convertible to bool (which was 15 minutes of fun to debug). Also
-    // just seems like good hygiene to make sure we get the Vectorized
-    // we're expecting.
-    return std::is_same_v<
-        std::invoke_result_t<Op, ignore_first_yield_second<Args, Vec>...>,
-        Vec>;
-  }
-  return false;
-}
-#endif // ET_USE_PYTORCH_HEADERS
-
 template <
     typename CTYPE_COMPUTE,
     typename CTYPE_OUT,
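
The bool comment in the trait removed above is worth unpacking: std::is_invocable_v alone is not a safe test because at::vec::Vectorized is implicitly convertible to a pointer, and pointers convert to bool, so a lambda taking bool looks invocable with a Vectorized argument. A standalone sketch of the false positive and of the is_same_v check that rejects it, with FakeVec standing in for Vectorized:

#include <type_traits>

// Stand-in for at::vec::Vectorized: implicitly convertible to a pointer,
// and therefore (via the pointer-to-bool conversion) to bool.
struct FakeVec {
  float lanes[4];
  operator const float*() const { return lanes; }
};

int main() {
  auto bool_op = [](bool b) { return !b; };
  // False positive: FakeVec -> const float* -> bool, so the lambda is
  // formally invocable with a FakeVec argument...
  static_assert(std::is_invocable_v<decltype(bool_op), FakeVec>);
  // ...but its result is bool, not FakeVec, so also requiring the result
  // type to be the vector type (as can_use_vectorized did) rejects it.
  static_assert(!std::is_same_v<
                std::invoke_result_t<decltype(bool_op), FakeVec>,
                FakeVec>);
  return 0;
}
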
@@ -98,90 +61,8 @@ inline void dtype_specialized_elementwise_fn_impl(
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
-  static_assert(
-      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
-       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  // All inputs must be of type CTYPE_COMPUTE.
-  ET_DCHECK(
-      ((inputs.first->scalar_type() ==
-        CppTypeToScalarType<CTYPE_COMPUTE>::value) &&
-       ...));
-
-#ifdef ET_USE_PYTORCH_HEADERS
-  if constexpr (can_use_vectorized<CTYPE_COMPUTE, Op, Args...>()) {
-    const bool any_is_broadcasted =
-        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
-              inputs.first->sizes(), out.sizes()) &&
-          ...);
-    if (!any_is_broadcasted) {
-      using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
-      ::executorch::extension::parallel_for(
-          0,
-          out.numel(),
-          ::executorch::extension::internal::GRAIN_SIZE,
-          [&](const auto begin, const auto end) {
-            std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
-                inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
-
-            CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
-
-            const auto vectorized_begin =
-                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
-            const auto vectorized_end = end - (end % Vec::size());
-            // Scalar prologue.
-            for (const auto idx : c10::irange(begin, vectorized_begin)) {
-              // In debug mode, always use Vectorized so that even
-              // small-sized tests will test whether using Vectorized broke our
-              // lambda.
-#ifndef NDEBUG
-              std::array<Vec, kNumInputs> loaded_inputs;
-#else // NDEBUG
-              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
-#endif // NDEBUG
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
-              }
-#ifndef NDEBUG
-              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
-#else // NDEBUG
-              data_out[idx] = std::apply(compute_fun, loaded_inputs);
-#endif // NDEBUG
-            }
-
-            // Main vectorized loop.
-            for (auto idx = vectorized_begin; idx < vectorized_end;
-                 idx += Vec::size()) {
-              std::array<Vec, kNumInputs> loaded_vec_inputs;
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_vec_inputs[input_idx] =
-                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
-              }
-              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
-              result_vec.store(&data_out[idx]);
-            }
-
-            // Scalar epilogue.
-            for (const auto idx : c10::irange(vectorized_end, end)) {
-#ifndef NDEBUG
-              std::array<Vec, kNumInputs> loaded_inputs;
-#else // NDEBUG
-              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
-#endif // NDEBUG
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
-              }
-#ifndef NDEBUG
-              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
-#else // NDEBUG
-              data_out[idx] = std::apply(compute_fun, loaded_inputs);
-#endif // NDEBUG
-            }
-          });
-      return;
-    }
-  }
-#endif // ET_USE_PYTORCH_HEADERS
+  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...));
 
   ::executorch::extension::parallel_for(
       0,
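
The core of the removed fast path is its three-phase loop: a scalar prologue up to the first lane-aligned index, a vectorized main loop, and a scalar epilogue for the tail. The alignment arithmetic is vectorized_begin = begin + (Vec::size() - begin % Vec::size()) % Vec::size(); for example, begin = 5 with 8 lanes gives vectorized_begin = 8, and end = 21 gives vectorized_end = 16. A minimal standalone sketch of the structure, using a fixed lane count in place of at::vec::Vectorized and raw pointers in place of Tensors:

#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 8; // stand-in for Vec::size()

// Mirrors the shape of the removed dtype_specialized_elementwise_fn_impl
// fast path for a single input: scalar prologue, "vectorized" main loop,
// scalar epilogue.
template <typename Op>
void elementwise(const float* in, float* out, std::size_t begin,
                 std::size_t end, Op op) {
  if (end - begin < kVecSize) { // short range: scalar only
    for (std::size_t i = begin; i < end; ++i) {
      out[i] = op(in[i]);
    }
    return;
  }
  // First lane-aligned index at or after begin.
  const std::size_t vectorized_begin =
      begin + (kVecSize - begin % kVecSize) % kVecSize;
  // Last lane-aligned index at or before end.
  const std::size_t vectorized_end = end - (end % kVecSize);
  // Scalar prologue.
  for (std::size_t i = begin; i < vectorized_begin; ++i) {
    out[i] = op(in[i]);
  }
  // Main loop: a real implementation would Vec::loadu kVecSize elements,
  // apply op once to the whole vector, and store it back.
  for (std::size_t i = vectorized_begin; i < vectorized_end; i += kVecSize) {
    for (std::size_t lane = 0; lane < kVecSize; ++lane) {
      out[i + lane] = op(in[i + lane]);
    }
  }
  // Scalar epilogue.
  for (std::size_t i = vectorized_end; i < end; ++i) {
    out[i] = op(in[i]);
  }
}

int main() {
  std::vector<float> in(21, 2.0f), out(21, 0.0f);
  // begin = 5, end = 21: indices 5-7 and 16-20 take the scalar paths.
  elementwise(in.data(), out.data(), 5, 21, [](float v) { return v * v; });
  std::cout << out[5] << ' ' << out[20] << '\n'; // 4 4
}
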
@@ -359,19 +240,6 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
-/**
- * Useful for unary elementwise operators. For each element of the
- * input, call Op and write to the corresponding element of the
- * output. Tensor broadcasting is applied wherever it is required.
- *
- * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
- * parameters; normal lambdas are fine), it must fulfill one of the
- * following conditions. Either:
- * 1) It must in fact compile when passed at::vec::Vectorized<CTYPE_COMPUTE>, or
- * 2) It must be actively SFINAE-friendly, as per the C++17 examples in
- * https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable
- * .
- */
 template <
     typename CTYPE_COMPUTE,
     const char* op_name,
@@ -413,8 +281,6 @@
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
- * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
- * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
@@ -481,9 +347,6 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
- *
- * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
- * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
