Propagate mul optimizations from D61504544/D61560825/D61560826 to add/sub/div #4966

Merged: 9 commits, Aug 29, 2024
91 changes: 91 additions & 0 deletions kernels/optimized/cpu/binary_ops.h
@@ -0,0 +1,91 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {
namespace internal {
// NOTE: we bake ArrayRef iterators being pointers into the return
// type here because we assume that iterators are portable across
// ArrayRef copies.
inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
ArrayRef<Tensor::SizesType> arr) {
return std::find_if(
arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
}

inline bool sizes_match_ignoring_leading_1s(
ArrayRef<Tensor::SizesType> lhs,
ArrayRef<Tensor::SizesType> rhs) {
auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
auto lhs_end = lhs.end();

auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
auto rhs_end = rhs.end();

return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
std::equal(lhs_begin, lhs_end, rhs_begin);
}
} // namespace internal

enum class ElementwiseOptimizedPath {
kNone,
kTreatAs1d,
kBroadcast2dBy1d,
kBroadcast2dBy1dReverseArguments,
};

namespace internal {
inline ElementwiseOptimizedPath select_broadcast_2d_by_1d_optimized_path(
const Tensor& lhs,
const Tensor& rhs) {
auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
auto lhs_end = lhs.sizes().end();

auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs.sizes());
auto rhs_end = rhs.sizes().end();

const auto lhs_size = lhs_end - lhs_begin;
const auto rhs_size = rhs_end - rhs_begin;
if (lhs_size == 2 && rhs_size == 1 && lhs_begin[1] == rhs_begin[0]) {
return ElementwiseOptimizedPath::kBroadcast2dBy1d;
}

if (lhs_size == 1 && rhs_size == 2 && rhs_begin[1] == lhs_begin[0]) {
return ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments;
}

return ElementwiseOptimizedPath::kNone;
}
} // namespace internal

ElementwiseOptimizedPath inline select_optimized_path(
const Tensor& a,
const Tensor& b,
const Tensor& out) {
ScalarType a_type = a.scalar_type();
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();

if (a_type != b_type || a_type != out_type || a_type == ScalarType::Half) {
return ElementwiseOptimizedPath::kNone;
}
if (a.sizes().equals(b.sizes()) ||
(a.numel() == b.numel() &&
(a.numel() == out.numel() ||
internal::sizes_match_ignoring_leading_1s(a.sizes(), b.sizes())))) {
return ElementwiseOptimizedPath::kTreatAs1d;
}
return internal::select_broadcast_2d_by_1d_optimized_path(a, b);
}

} // namespace executor
} // namespace torch
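
For reference, here is a standalone sketch (not part of this PR) that mirrors the `sizes_match_ignoring_leading_1s` comparison on plain `std::vector<int64_t>`, so the leading-1 stripping can be tried outside ExecuTorch's `ArrayRef`/`Tensor` types:

```cpp
// Standalone sketch: mirrors sizes_match_ignoring_leading_1s on plain
// std::vector<int64_t> instead of ArrayRef<Tensor::SizesType>.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static bool sizes_match_ignoring_leading_1s(
    const std::vector<int64_t>& lhs,
    const std::vector<int64_t>& rhs) {
  // Skip any leading dimensions of size 1, then compare the remainders.
  auto lhs_begin = std::find_if(
      lhs.begin(), lhs.end(), [](int64_t x) { return x != 1; });
  auto rhs_begin = std::find_if(
      rhs.begin(), rhs.end(), [](int64_t x) { return x != 1; });
  return (lhs.end() - lhs_begin) == (rhs.end() - rhs_begin) &&
      std::equal(lhs_begin, lhs.end(), rhs_begin);
}

int main() {
  // {1, 1, 4, 5} and {4, 5} describe the same contiguous data layout.
  std::cout << sizes_match_ignoring_leading_1s({1, 1, 4, 5}, {4, 5}) << "\n";  // 1
  // {4, 5} vs {5, 4} differ once leading 1s are stripped.
  std::cout << sizes_match_ignoring_leading_1s({4, 5}, {5, 4}) << "\n";        // 0
  return 0;
}
```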
74 changes: 72 additions & 2 deletions kernels/optimized/cpu/op_add.cpp
@@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
@@ -81,8 +82,41 @@ Tensor& opt_add_out(
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();

if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes()) &&
a_type != ScalarType::Half) {
if (b.numel() == 1) {
if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
auto error = resize_tensor(out, a.sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(a_type, ctx, "add.out", CTYPE, [&]() {
ET_SWITCH_REALB_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
CTYPE alpha_val;
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
CTYPE_B b_val = *b.const_data_ptr<CTYPE_B>();
CTYPE b_casted = static_cast<CTYPE>(b_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::map<CTYPE>(
[alpha_val, b_casted](Vec x) {
return x + Vec(alpha_val * b_casted);
},
out.mutable_data_ptr<CTYPE>(),
a.const_data_ptr<CTYPE>(),
out.numel());
});
});
return out;
}
} else if (a.numel() == 1) {
return opt_add_out(ctx, b, a, alpha, out);
}

auto selected_optimized_path = select_optimized_path(a, b, out);
if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
// Resize for dynamic shape
auto error = resize_tensor(out, a.sizes());
ET_KERNEL_CHECK_MSG(
@@ -105,6 +139,42 @@
b.const_data_ptr<CTYPE>(),
out.numel());
});
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
const Tensor* lhs;
const Tensor* rhs;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
lhs = &b;
rhs = &a;
} else {
// Catch failure to update logic when adding new broadcasting possibility.
ET_DCHECK(
selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1d);
lhs = &a;
rhs = &b;
}
auto error = resize_tensor(out, lhs->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "add.out", CTYPE, [&]() {
CTYPE alpha_val;
ET_KERNEL_CHECK(
ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );

using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[alpha_val](Vec x, Vec y) { return x + Vec(alpha_val) * y; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
lhs->sizes()[lhs->dim() - 2],
lhs->sizes()[lhs->dim() - 1]);
});
} else {
ScalarType common_type =
promoteTypes(a_type, b_type, /*half_to_float*/ true);
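
The `kBroadcast2dBy1d` path in `opt_add_out` assumes `executorch::vec::broadcasting_map_2d_by_1d` applies the binary op between every row of a 2-D lhs and a single 1-D rhs. A rough scalar reference of that assumed contract (the real helper is vectorized), shown here with the `add.out` lambda:

```cpp
// Scalar reference (an assumption about semantics, not the vectorized
// helper itself): out[i*n + j] = f(lhs[i*n + j], rhs[j]) for an m x n
// lhs broadcast against a length-n rhs.
#include <cstddef>
#include <iostream>

template <typename CTYPE, typename Op>
void broadcasting_map_2d_by_1d_reference(
    const Op& f,
    CTYPE* out,
    const CTYPE* lhs,
    const CTYPE* rhs,
    std::size_t m,
    std::size_t n) {
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      // The single 1-D operand is reused for every row of the 2-D operand.
      out[i * n + j] = f(lhs[i * n + j], rhs[j]);
    }
  }
}

int main() {
  // add.out with alpha = 2: out = lhs + alpha * rhs, rhs broadcast per row.
  const float lhs[6] = {1, 2, 3, 4, 5, 6};  // shape (2, 3)
  const float rhs[3] = {10, 20, 30};        // shape (3,)
  float out[6] = {};
  const float alpha = 2.0f;
  broadcasting_map_2d_by_1d_reference<float>(
      [alpha](float x, float y) { return x + alpha * y; }, out, lhs, rhs, 2, 3);
  for (float v : out) {
    std::cout << v << ' ';  // 21 42 63 24 45 66
  }
  std::cout << '\n';
  return 0;
}
```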
132 changes: 112 additions & 20 deletions kernels/optimized/cpu/op_div.cpp
@@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/optimized/cpu/binary_ops.h>
#include <executorch/kernels/optimized/vec/functional.h>
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
@@ -48,7 +49,57 @@ Tensor& opt_div_out(
ScalarType b_type = b.scalar_type();
ScalarType out_type = out.scalar_type();

if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes())) {
if (a.numel() == 1 || b.numel() == 1) {
if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
const Tensor* tensor;
const Tensor* scalar;
ScalarType tensor_type;
ScalarType scalar_type;
if (a.numel() == 1) {
tensor = &b;
tensor_type = b_type;
scalar = &a;
scalar_type = a_type;
} else {
tensor = &a;
tensor_type = a_type;
scalar = &b;
scalar_type = b_type;
}
auto error = resize_tensor(out, tensor->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(tensor_type, ctx, "div.out", CTYPE, [&]() {
ET_SWITCH_REALB_TYPES(scalar_type, ctx, "div.out", CTYPE_SCALAR, [&]() {
CTYPE_SCALAR scalar_val = *scalar->const_data_ptr<CTYPE_SCALAR>();
CTYPE scalar_casted = static_cast<CTYPE>(scalar_val);

using Vec = executorch::vec::Vectorized<CTYPE>;
if (a.numel() == 1) {
executorch::vec::map<CTYPE>(
[scalar_casted](Vec x) { return Vec(scalar_casted) / x; },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
} else {
executorch::vec::map<CTYPE>(
[scalar_casted](Vec x) { return x / Vec(scalar_casted); },
out.mutable_data_ptr<CTYPE>(),
tensor->const_data_ptr<CTYPE>(),
out.numel());
}
});
});
return out;
}
}

auto selected_optimized_path = select_optimized_path(a, b, out);
if (selected_optimized_path == ElementwiseOptimizedPath::kTreatAs1d) {
// Resize for dynamic shape
auto error = resize_tensor(out, a.sizes());
ET_KERNEL_CHECK_MSG(
@@ -67,6 +118,49 @@
b.const_data_ptr<CTYPE>(),
out.numel());
});
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
const Tensor* lhs;
const Tensor* rhs;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
lhs = &b;
rhs = &a;
} else {
// Catch failure to update logic when adding new broadcasting possibility.
ET_DCHECK(
selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1d);
lhs = &a;
rhs = &b;
}
auto error = resize_tensor(out, lhs->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
ET_SWITCH_REALB_TYPES(out_type, ctx, "sub.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[](Vec x, Vec y) { return y / x; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
lhs->sizes()[lhs->dim() - 2],
lhs->sizes()[lhs->dim() - 1]);
} else {
executorch::vec::broadcasting_map_2d_by_1d<CTYPE>(
[](Vec x, Vec y) { return x / y; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
lhs->sizes()[lhs->dim() - 2],
lhs->sizes()[lhs->dim() - 1]);
}
});
} else {
ScalarType common_type = get_compute_type(a_type, b_type);
ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
@@ -77,25 +171,23 @@
InvalidArgument,
out);

ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
ET_SWITCH_REAL_TYPES_AND(
Bool, common_type, ctx, "div.out", CTYPE_IN, [&]() {
ET_SWITCH_REAL_TYPES_AND(
Bool, out_type, ctx, "div.out", CTYPE_OUT, [&]() {
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
[](const CTYPE_A val_a, const CTYPE_B val_b) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
CTYPE_IN value = a_casted / b_casted;

return static_cast<CTYPE_OUT>(value);
},
a,
b,
out);
});
});
ET_SWITCH_REALB_TYPES(a_type, ctx, "div.out", CTYPE_A, [&]() {
ET_SWITCH_REALB_TYPES(b_type, ctx, "div.out", CTYPE_B, [&]() {
ET_SWITCH_REALB_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
ET_SWITCH_REALB_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
[](const CTYPE_A val_a, const CTYPE_B val_b) {
CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
CTYPE_IN value = a_casted / b_casted;

return static_cast<CTYPE_OUT>(value);
},
a,
b,
out);
});
});
});
});
}
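
Because division is not commutative, the scalar fast path in `opt_div_out` has to pick between `scalar / x` (when `a.numel() == 1`) and `x / scalar` (when `b.numel() == 1`), and the reversed-argument broadcast path flips the lambda to `y / x`. A small standalone sketch of the scalar case (an illustration outside ExecuTorch, not the PR's code):

```cpp
// Standalone sketch of the two div scalar fast paths: the lambda must
// change depending on which operand is the single-element tensor.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> div_scalar_reference(
    const std::vector<float>& tensor, float scalar, bool scalar_is_numerator) {
  std::vector<float> out(tensor.size());
  for (std::size_t i = 0; i < tensor.size(); ++i) {
    // a.numel() == 1  ->  scalar / x;  b.numel() == 1  ->  x / scalar.
    out[i] = scalar_is_numerator ? scalar / tensor[i] : tensor[i] / scalar;
  }
  return out;
}

int main() {
  const std::vector<float> t = {1.0f, 2.0f, 4.0f};
  for (float v : div_scalar_reference(t, 8.0f, /*scalar_is_numerator=*/true)) {
    std::cout << v << ' ';  // 8 4 2
  }
  std::cout << '\n';
  for (float v : div_scalar_reference(t, 8.0f, /*scalar_is_numerator=*/false)) {
    std::cout << v << ' ';  // 0.125 0.25 0.5
  }
  std::cout << '\n';
  return 0;
}
```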