Commit c275c64

Revert "Add vectorization in elementwise_util (#9432)"
This reverts commit 4c35fe0.
1 parent a2e898e commit c275c64

24 files changed: 41 additions, 370 deletions

.lintrunner.toml

Lines changed: 0 additions & 2 deletions
@@ -272,8 +272,6 @@ exclude_patterns = [
     'exir/verification/bindings.cpp',
     'extension/**',
     # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
-    'kernels/portable/cpu/util/math_util.h',
     'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',

kernels/portable/cpu/op_add.cpp

Lines changed: 4 additions & 8 deletions
@@ -102,18 +102,14 @@ Tensor& add_scalar_out(
   static constexpr const char op_name[] = "add.Scalar_out";
 
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
-    CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
-    CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
-    auto val_alpha_times_b = val_alpha * val_b;
     utils::apply_unitensor_elementwise_fn<
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [val_alpha_times_b](const auto val_a) {
-          // Cast here supports vectorization; either it does nothing
-          // or it casts from CTYPE_COMPUTE to
-          // Vectorized<CTYPE_COMPUTE>.
-          return val_a + decltype(val_a)(val_alpha_times_b);
+        [b, alpha](const auto val_a) {
+          CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
+          CTYPE_COMPUTE val_alpha = utils::scalar_to<CTYPE_COMPUTE>(alpha);
+          return val_a + val_alpha * val_b;
         },
         ctx,
         a,

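The comment deleted above describes the trick that made the old lambda vectorization-capable: decltype(val_a)(val_alpha_times_b) is a no-op cast when val_a is a plain CTYPE_COMPUTE, and a scalar-to-all-lanes broadcast when val_a is at::vec::Vectorized<CTYPE_COMPUTE>. A minimal standalone sketch of the same pattern, with FakeVec as a stand-in for the ATen vector type:

#include <array>
#include <iostream>

// Stand-in for at::vec::Vectorized<float>: constructible from a scalar
// (broadcasting it to every lane) and supporting elementwise operator+.
struct FakeVec {
  std::array<float, 4> lanes{};
  FakeVec() = default;
  explicit FakeVec(float v) { lanes.fill(v); }
  FakeVec operator+(const FakeVec& o) const {
    FakeVec r;
    for (int i = 0; i < 4; ++i) {
      r.lanes[i] = lanes[i] + o.lanes[i];
    }
    return r;
  }
};

int main() {
  const float val_alpha_times_b = 3.0f;
  // Generic lambda in the style of the reverted op_add code: the same
  // body instantiates for both the scalar and the vector type.
  auto add_fn = [val_alpha_times_b](const auto val_a) {
    return val_a + decltype(val_a)(val_alpha_times_b);
  };
  std::cout << add_fn(1.0f) << '\n';                   // scalar path: 4
  std::cout << add_fn(FakeVec(1.0f)).lanes[0] << '\n'; // vector path: 4 per lane
}
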
kernels/portable/cpu/op_atan2.cpp

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ Tensor& atan2_out(
         op_name,
         utils::SupportedTensorDtypes::FLOATHBF16>(
         [](const auto val_a, const auto val_b) {
-          return executorch::math::atan2(val_a, val_b);
+          return std::atan2(val_a, val_b);
         },
         ctx,
         a,

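For context: executorch::math::atan2 comes from vectorized_math.h, whose include this commit also removes. Wrappers of that sort exist because std::atan2 has no overload accepting SIMD vector arguments, so switching back to std::atan2 is what pins this lambda to scalars. A hedged sketch of the overload-set shape such a wrapper plausibly has, with MyVec as a hypothetical stand-in for the vector type:

#include <cmath>

// Hypothetical 4-lane vector type standing in for at::vec::Vectorized<float>.
struct MyVec {
  float lanes[4];
};

namespace mymath {
// Scalar overload: defer to the standard library.
inline float atan2(float a, float b) {
  return std::atan2(a, b);
}
// Vector overload: apply the scalar routine lane by lane. (A real SIMD
// implementation would use a vectorized atan2 instead of this loop.)
inline MyVec atan2(const MyVec& a, const MyVec& b) {
  MyVec r;
  for (int i = 0; i < 4; ++i) {
    r.lanes[i] = std::atan2(a.lanes[i], b.lanes[i]);
  }
  return r;
}
} // namespace mymath

With both overloads present, a generic lambda calling mymath::atan2(val_a, val_b) compiles for scalars and vectors alike; with only std::atan2, it compiles for scalars only.
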
kernels/portable/cpu/op_clamp.cpp

Lines changed: 3 additions & 2 deletions
@@ -138,8 +138,9 @@ Tensor& clamp_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [has_min, min_opt, has_max, max_opt](const auto val_in) {
-          auto val_out = val_in;
+        [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE val_out = val_in;
           if (has_min) {
             val_out = utils::max_override(
                 val_out, utils::scalar_to<CTYPE_COMPUTE>(min_opt.value()));

kernels/portable/cpu/op_elu.cpp

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,8 @@ Tensor& elu_out(
         CTYPE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [negcoef, math_scale, math_input_scale](const CTYPE x) {
+        [negcoef, math_scale, math_input_scale](const auto x) {
+          // TODO: rewrite this to be vectorization-capable.
           return MathT(x) <= MathT(0)
               ? std::expm1(MathT(x) * math_input_scale) * negcoef
               : MathT(x) * math_scale;

kernels/portable/cpu/op_fmod.cpp

Lines changed: 5 additions & 3 deletions
@@ -61,7 +61,7 @@ Tensor& fmod_Tensor_out(
         utils::SupportedTensorDtypes::REALHBF16>(
         [&div_by_zero_error](
             const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
-          // TODO: rewrite this to be vectorization-capable?
+          // TODO: rewrite this to be vectorization-capable.
           CTYPE_COMPUTE value = 0;
           if (is_integral_type<CTYPE_COMPUTE, /*includeBool=*/true>::value) {
             if (val_b == 0) {
@@ -138,8 +138,10 @@ Tensor& fmod_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [val_b](const auto val_a) {
-          return executorch::math::fmod(val_a, (decltype(val_a))val_b);
+        [val_b](const CTYPE_COMPUTE val_a) {
+          // TODO: rewrite this to be vectorization-capable.
+          CTYPE_COMPUTE value = std::fmod(val_a, val_b);
+          return value;
         },
         ctx,
         a,

kernels/portable/cpu/op_maximum.cpp

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ Tensor& maximum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
           return utils::max_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_minimum.cpp

Lines changed: 2 additions & 1 deletion
@@ -49,7 +49,8 @@ Tensor& minimum_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) {
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          // TODO: rewrite this to be vectorization-capable.
           return utils::min_override(val_a, val_b);
         },
         ctx,

kernels/portable/cpu/op_mul.cpp

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,9 @@ Tensor& mul_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBBF16>(
-        [](const auto val_a, const auto val_b) { return val_a * val_b; },
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          return val_a * val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_native_dropout.cpp

Lines changed: 4 additions & 6 deletions
@@ -57,11 +57,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
   }
   ET_SWITCH_FLOATHBF16_TYPES(
       input.scalar_type(), ctx, op_name, CTYPE_COMPUTE, [&]() {
-        utils::apply_bitensor_elementwise_fn<
-            CTYPE_COMPUTE,
-            op_name,
-            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-            [](const CTYPE_COMPUTE val, const CTYPE_COMPUTE mask_val) {
+        utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const auto val, const auto mask_val) {
               if (!mask_val) {
                 return static_cast<decltype(val)>(0);
               }
@@ -73,7 +70,8 @@ std::tuple<Tensor&, Tensor&> native_dropout_out(
             mask,
             // TODO: should really be just BOOL
             utils::SupportedTensorDtypes::BOOL_OR_BYTE,
-            out);
+            out,
+            utils::SupportedTensorDtypes::SAME_AS_COMMON);
       });
   } else if (input.numel() > 0) {
     std::memcpy(out.mutable_data_ptr(), input.data_ptr(), input.nbytes());

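Both call shapes of apply_bitensor_elementwise_fn appear in this hunk: the reverted code passed the output-dtype restriction (SAME_AS_COMMON) as a third template argument, while the restored code passes it as a trailing runtime argument. A toy illustration of that tradeoff (not ExecuTorch's actual signatures): a template parameter is baked in at compile time, so the implementation can specialize per dtype set, at the cost of one instantiation per value; a runtime argument keeps a single instantiation.

#include <cstdio>

enum class SupportedTensorDtypes { SAME_AS_COMMON, BOOL_OR_BYTE };

// Compile-time shape: the dtype restriction is a template parameter, as
// in the reverted call. Each distinct value is a separate instantiation.
template <SupportedTensorDtypes out_dtypes>
void apply_fn_static() {
  std::printf("static: %d\n", static_cast<int>(out_dtypes));
}

// Runtime shape: the dtype restriction is a trailing argument, as in the
// restored call. One function body serves every value.
void apply_fn_dynamic(SupportedTensorDtypes out_dtypes) {
  std::printf("dynamic: %d\n", static_cast<int>(out_dtypes));
}

int main() {
  apply_fn_static<SupportedTensorDtypes::SAME_AS_COMMON>();
  apply_fn_dynamic(SupportedTensorDtypes::SAME_AS_COMMON);
}
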
kernels/portable/cpu/op_pow.cpp

Lines changed: 7 additions & 16 deletions
@@ -57,8 +57,9 @@ Tensor& pow_Tensor_Tensor_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        [](const auto val_a, const auto val_b) {
-          return executorch::math::pow(val_a, val_b);
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          // TODO: rewrite this to be vectorization-capable.
+          return std::pow(val_a, val_b);
         },
         ctx,
         a,
@@ -110,13 +111,8 @@ Tensor& pow_Tensor_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // Casting val_b here supports vectorization; it does
-        // nothing if we are not vectorizing (casts to
-        // CTYPE_COMPUTE) and casts to a vectorized type
-        // otherwise.
-        [val_b](const auto val_a) {
-          return executorch::math::pow(val_a, decltype(val_a)(val_b));
-        },
+        // TODO: rewrite this to be vectorization-capable.
+        [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
@@ -165,13 +161,8 @@ Tensor& pow_Scalar_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::REALHBF16>(
-        // Casting val_a here supports vectorization; it does
-        // nothing if we are not vectorizing (casts to
-        // CTYPE_COMPUTE) and casts to a vectorized type
-        // otherwise.
-        [val_a](const auto val_b) {
-          return executorch::math::pow(decltype(val_b)(val_a), val_b);
-        },
+        // TODO: rewrite this to be vectorization-capable.
+        [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); },
         ctx,
         b,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/op_sigmoid.cpp

Lines changed: 4 additions & 3 deletions
@@ -49,9 +49,10 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::FLOATHBF16>(
-        [](const auto val_in) {
-          const auto one = static_cast<decltype(val_in)>(1.0);
-          auto out_val = one / (one + executorch::math::exp(-val_in));
+        [](const auto val_in) -> CTYPE_COMPUTE {
+          // TODO: rewrite this to be vectorization-capable
+          CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
+              (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
           return out_val;
         },
         ctx,

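One detail worth noting in the restored lambda is the trailing return type. Without -> CTYPE_COMPUTE, the deduced return type would follow the arithmetic, which can promote (for example through double literals or a double-returning exp overload); pinning it keeps the lambda returning CTYPE_COMPUTE. A small self-contained illustration of the deduction difference:

#include <cmath>
#include <type_traits>

int main() {
  // Deduced return type follows the arithmetic: the double literal 1.0
  // promotes the whole expression to double.
  auto sigmoid_deduced = [](const auto v) {
    return 1.0 / (1.0 + std::exp(-v));
  };
  // A trailing return type pins the result, as the restored op_sigmoid
  // lambda does with -> CTYPE_COMPUTE.
  auto sigmoid_pinned = [](const auto v) -> float {
    return 1.0 / (1.0 + std::exp(-v));
  };
  static_assert(std::is_same_v<decltype(sigmoid_deduced(0.5f)), double>);
  static_assert(std::is_same_v<decltype(sigmoid_pinned(0.5f)), float>);
  return 0;
}
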
kernels/portable/cpu/op_where.cpp

Lines changed: 3 additions & 3 deletions
@@ -47,9 +47,9 @@ Tensor& where_out(
         CTYPE_COMPUTE,
         op_name,
         utils::SupportedTensorDtypes::SAME_AS_COMMON>(
-        [](const CTYPE_COMPUTE val_a,
-           const CTYPE_COMPUTE val_b,
-           const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; },
+        [](const auto val_a, const auto val_b, const auto val_c) {
+          return val_c ? val_a : val_b;
+        },
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,

kernels/portable/cpu/util/elementwise_util.h

Lines changed: 1 addition & 138 deletions
@@ -12,14 +12,9 @@
 #include <executorch/kernels/portable/cpu/util/broadcast_indexes_range.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/kernels/portable/cpu/util/dtype_util.h>
-#include <executorch/kernels/portable/cpu/util/vectorized_math.h> // Make vectorization support easy for clients.
 #include <executorch/runtime/kernel/kernel_runtime_context.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>
 
-#ifdef ET_USE_PYTORCH_HEADERS
-#include <ATen/cpu/vec/vec.h>
-#endif // ET_USE_PYTORCH_HEADERS
-
 #include <array>
 #include <utility>
 
@@ -56,38 +51,6 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 }
 
 namespace internal {
-template <typename Ignore, typename T>
-using ignore_first_yield_second = T;
-
-#ifdef ET_USE_PYTORCH_HEADERS
-// Can I call a function of type Op with sizeof...(Args) arguments of type
-// at::vec::Vectorized<CTYPE_COMPUTE>?
-//
-// See [NOTE: Generic lambdas] below for requirements on Op.
-template <typename CTYPE_COMPUTE, typename Op, typename... Args>
-constexpr bool can_use_vectorized() {
-  using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
-  // NOTE: if we start building optimized kernels on platforms that
-  // ATen Vectorized doesn't support well, we will want to add a way
-  // to check that Vectorized actually does something on our target
-  // platform. For now, I see no concrete need for that.
-  if constexpr (std::is_invocable_v<
-                    Op,
-                    ignore_first_yield_second<Args, Vec>...>) {
-    // For bool, we will get a false positive if we rely on only the
-    // is_invocable_v check above because at::vec::Vectorized is
-    // implicitly convertible to a pointer, which makes it implicitly
-    // convertible to bool (which was 15 minutes of fun to debug). Also
-    // just seems like good hygiene to make sure we get the Vectorized
-    // we're expecting.
-    return std::is_same_v<
-        std::invoke_result_t<Op, ignore_first_yield_second<Args, Vec>...>,
-        Vec>;
-  }
-  return false;
-}
-#endif // ET_USE_PYTORCH_HEADERS
-
 template <
     typename CTYPE_COMPUTE,
     typename CTYPE_OUT,
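
The bool comment in the trait removed above is worth unpacking: std::is_invocable_v alone is not a safe test because at::vec::Vectorized is implicitly convertible to a pointer, and pointers convert to bool, so a lambda taking bool looks invocable with a Vectorized argument. A standalone sketch of the false positive and of the is_same_v check that rejects it, with FakeVec standing in for Vectorized:

#include <type_traits>

// Stand-in for at::vec::Vectorized: implicitly convertible to a pointer,
// and therefore (via the pointer-to-bool conversion) to bool.
struct FakeVec {
  float lanes[4];
  operator const float*() const { return lanes; }
};

int main() {
  auto bool_op = [](bool b) { return !b; };
  // False positive: FakeVec -> const float* -> bool, so the lambda is
  // formally invocable with a FakeVec argument...
  static_assert(std::is_invocable_v<decltype(bool_op), FakeVec>);
  // ...but its result is bool, not FakeVec, so also requiring the result
  // type to be the vector type (as can_use_vectorized did) rejects it.
  static_assert(!std::is_same_v<
                std::invoke_result_t<decltype(bool_op), FakeVec>,
                FakeVec>);
  return 0;
}
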
@@ -98,90 +61,8 @@ inline void dtype_specialized_elementwise_fn_impl(
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
-  static_assert(
-      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
-       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  // All inputs must be of type CTYPE_COMPUTE.
-  ET_DCHECK(
-      ((inputs.first->scalar_type() ==
-        CppTypeToScalarType<CTYPE_COMPUTE>::value) &&
-       ...));
-
-#ifdef ET_USE_PYTORCH_HEADERS
-  if constexpr (can_use_vectorized<CTYPE_COMPUTE, Op, Args...>()) {
-    const bool any_is_broadcasted =
-        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
-              inputs.first->sizes(), out.sizes()) &&
-          ...);
-    if (!any_is_broadcasted) {
-      using Vec = at::vec::Vectorized<CTYPE_COMPUTE>;
-      ::executorch::extension::parallel_for(
-          0,
-          out.numel(),
-          ::executorch::extension::internal::GRAIN_SIZE,
-          [&](const auto begin, const auto end) {
-            std::array<const CTYPE_COMPUTE*, kNumInputs> inputs_data_ptrs = {
-                inputs.first->template const_data_ptr<CTYPE_COMPUTE>()...};
-
-            CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
-
-            const auto vectorized_begin =
-                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
-            const auto vectorized_end = end - (end % Vec::size());
-            // Scalar prologue.
-            for (const auto idx : c10::irange(begin, vectorized_begin)) {
-              // In debug mode, always use Vectorized so that even
-              // small-sized tests will test whether using Vectorized broke our
-              // lambda.
-#ifndef NDEBUG
-              std::array<Vec, kNumInputs> loaded_inputs;
-#else // NDEBUG
-              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
-#endif // NDEBUG
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
-              }
-#ifndef NDEBUG
-              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
-#else // NDEBUG
-              data_out[idx] = std::apply(compute_fun, loaded_inputs);
-#endif // NDEBUG
-            }
-
-            // Main vectorized loop.
-            for (auto idx = vectorized_begin; idx < vectorized_end;
-                 idx += Vec::size()) {
-              std::array<Vec, kNumInputs> loaded_vec_inputs;
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_vec_inputs[input_idx] =
-                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
-              }
-              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
-              result_vec.store(&data_out[idx]);
-            }
-
-            // Scalar epilogue.
-            for (const auto idx : c10::irange(vectorized_end, end)) {
-#ifndef NDEBUG
-              std::array<Vec, kNumInputs> loaded_inputs;
-#else // NDEBUG
-              std::array<CTYPE_COMPUTE, kNumInputs> loaded_inputs;
-#endif // NDEBUG
-              for (const auto input_idx : c10::irange(kNumInputs)) {
-                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
-              }
-#ifndef NDEBUG
-              std::apply(compute_fun, loaded_inputs).store(&data_out[idx], 1);
-#else // NDEBUG
-              data_out[idx] = std::apply(compute_fun, loaded_inputs);
-#endif // NDEBUG
-            }
-          });
-      return;
-    }
-  }
-#endif // ET_USE_PYTORCH_HEADERS
+  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMPUTE)) && ...));
 
   ::executorch::extension::parallel_for(
       0,
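
The core of the removed fast path is its three-phase loop: a scalar prologue up to the first lane-aligned index, a vectorized main loop, and a scalar epilogue for the tail. The alignment arithmetic is vectorized_begin = begin + (Vec::size() - begin % Vec::size()) % Vec::size(); for example, begin = 5 with 8 lanes gives vectorized_begin = 8, and end = 21 gives vectorized_end = 16. A minimal standalone sketch of the structure, using a fixed lane count in place of at::vec::Vectorized and raw pointers in place of Tensors:

#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t kVecSize = 8; // stand-in for Vec::size()

// Mirrors the shape of the removed dtype_specialized_elementwise_fn_impl
// fast path for a single input: scalar prologue, "vectorized" main loop,
// scalar epilogue.
template <typename Op>
void elementwise(const float* in, float* out, std::size_t begin,
                 std::size_t end, Op op) {
  if (end - begin < kVecSize) { // short range: scalar only
    for (std::size_t i = begin; i < end; ++i) {
      out[i] = op(in[i]);
    }
    return;
  }
  // First lane-aligned index at or after begin.
  const std::size_t vectorized_begin =
      begin + (kVecSize - begin % kVecSize) % kVecSize;
  // Last lane-aligned index at or before end.
  const std::size_t vectorized_end = end - (end % kVecSize);
  // Scalar prologue.
  for (std::size_t i = begin; i < vectorized_begin; ++i) {
    out[i] = op(in[i]);
  }
  // Main loop: a real implementation would Vec::loadu kVecSize elements,
  // apply op once to the whole vector, and store it back.
  for (std::size_t i = vectorized_begin; i < vectorized_end; i += kVecSize) {
    for (std::size_t lane = 0; lane < kVecSize; ++lane) {
      out[i + lane] = op(in[i + lane]);
    }
  }
  // Scalar epilogue.
  for (std::size_t i = vectorized_end; i < end; ++i) {
    out[i] = op(in[i]);
  }
}

int main() {
  std::vector<float> in(21, 2.0f), out(21, 0.0f);
  // begin = 5, end = 21: indices 5-7 and 16-20 take the scalar paths.
  elementwise(in.data(), out.data(), 5, 21, [](float v) { return v * v; });
  std::cout << out[5] << ' ' << out[20] << '\n'; // 4 4
}
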
@@ -359,19 +240,6 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
-/**
- * Useful for unary elementwise operators. For each element of the
- * input, call Op and write to the corresponding element of the
- * output. Tensor broadcasting is applied wherever it is required.
- *
- * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
- * parameters; normal lambdas are fine), it must fulfill one of the
- * following conditions. Either:
- * 1) It must in fact compile when passed at::vec::Vectorized<CTYPE_COMPUTE>, or
- * 2) It must be actively SFINAE-friendly, as per the C++17 examples in
- * https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable
- * .
- */
 template <
     typename CTYPE_COMPUTE,
     const char* op_name,
@@ -413,8 +281,6 @@
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
- * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
- * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
@@ -481,9 +347,6 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
- *
- * See [NOTE: Generic lambdas] if you want to pass a generic lambda for
- * compute_fun.
  */
 template <
     typename CTYPE_COMPUTE,
