[Executorch] Make apple take accelerate path for blas #6603

Merged: 2 commits, Oct 31, 2024
2 changes: 1 addition & 1 deletion kernels/optimized/TARGETS
@@ -5,4 +5,4 @@ load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
define_common_targets(True)
32 changes: 25 additions & 7 deletions kernels/optimized/cpu/binary_ops.h
@@ -43,17 +43,27 @@ enum class ElementwiseOptimizedPath {
kBroadcast2dBy1dReverseArguments,
kBroadcastNdByNd,
kBroadcastNdByNdReverseArguments,
kBroadcastLastDim,
kBroadcastLastDimReverseArguments,
};

namespace internal {

// Find the single broadcast dimension if it exists.
// This path aims to handle broadcasts of the following form:
// A = [a1, a2, ..., 1, ..., an]
// B = [b1, b2, ..., bm, ..., bn]
// OR
// A = [a1, a2, ..., am, ..., an]
// B = [b1, b2, ..., 1, ..., bn]
/*
Given two tensors, this function returns the broadcast dim if it exists.
Returns 0 if no broadcast dim is found.
Otherwise, a negative index indicates the broadcast dim,
e.g. if size = [a, b, c, 1, e, f] then the broadcast dim is -3.

This path aims to handle broadcasts of the following form:
A = [a1, a2, ..., 1, ..., an]
B = [b1, b2, ..., bm, ..., bn]
OR
A = [a1, a2, ..., am, ..., an]
B = [b1, b2, ..., 1, ..., bn]
Note that this way of determining the broadcast dim also works
when the broadcast dim is the last dim.
*/
int32_t inline get_broadcast_dim(const Tensor& lhs, const Tensor& rhs) {
auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs.sizes());
auto lhs_end = lhs.sizes().end();
@@ -125,6 +135,14 @@ inline ElementwiseOptimizedPath select_broadcast_optimized_path(
} else {
return ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments;
}
} else if (broadcast_dim == -1) {
if (std::count_if(lhs_begin, lhs_end, [](Tensor::SizesType x) {
return x == 1;
}) == 1) {
return ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments;
} else {
return ElementwiseOptimizedPath::kBroadcastLastDim;
}
}
return ElementwiseOptimizedPath::kNone;
}
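To make the new `kBroadcastLastDim` selection concrete, here is a simplified, standalone sketch of the negative-index convention that `get_broadcast_dim` documents. This is an illustration, not the ExecuTorch implementation (which also strips leading 1s and handles more cases): scan from the innermost dimension and return the first position where exactly one side has size 1, as a negative offset; 0 means no single broadcast dim was found.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int32_t sketch_get_broadcast_dim(const std::vector<int64_t>& lhs,
                                 const std::vector<int64_t>& rhs) {
  if (lhs.size() != rhs.size()) {
    return 0;  // Simplification: only consider equal-rank shapes here.
  }
  for (int32_t i = 1; i <= static_cast<int32_t>(lhs.size()); ++i) {
    const int64_t l = lhs[lhs.size() - i];
    const int64_t r = rhs[rhs.size() - i];
    if ((l == 1) != (r == 1)) {
      return -i;  // Negative offset from the innermost dim.
    }
  }
  return 0;  // No single broadcast dim found.
}

int main() {
  // [2, 3, 1, 5, 6] vs [2, 3, 4, 5, 6]: the third dim from the right
  // is the broadcast dim, so the result is -3. A result of -1 is what
  // triggers the new kBroadcastLastDim paths above.
  std::cout << sketch_get_broadcast_dim({2, 3, 1, 5, 6}, {2, 3, 4, 5, 6})
            << "\n";  // prints -3
}
```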
165 changes: 111 additions & 54 deletions kernels/optimized/cpu/op_mul.cpp
@@ -11,6 +11,7 @@
#include <executorch/kernels/optimized/vec/vec.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h> // IWYU pragma: export
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

@@ -66,6 +67,115 @@ template <
typename CTYPE_OUT>
struct MulInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
: public ReportCanCastBug {};

Tensor& handle_last_dim_broadcast(
KernelRuntimeContext& ctx,
const Tensor& a,
const Tensor& b,
Tensor& out,
const ElementwiseOptimizedPath selected_optimized_path) {
ScalarType out_type = out.scalar_type();
const Tensor* lhs;
const Tensor* rhs;
if (selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments) {
lhs = &b;
rhs = &a;
} else {
lhs = &a;
rhs = &b;
}
auto error = resize_tensor(out, lhs->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
const size_t outer_size = getLeadingDims(out, out.dim() - 1);
const auto broadcast_size = out.size(out.dim() - 1);
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_broadcast_last_dim<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
outer_size,
broadcast_size);
});
return out;
}
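For reference, the semantics that `handle_last_dim_broadcast` hands off to the vectorized helper can be written as two scalar loops: `lhs` is viewed as `[outer_size, broadcast_size]`, `rhs` as `[outer_size, 1]`, and each row of `lhs` is scaled by its matching `rhs` value. A minimal sketch, assuming that layout interpretation:

```cpp
#include <cstdint>

// out, lhs: outer_size * broadcast_size elements; rhs: outer_size elements.
void mul_broadcast_last_dim_ref(float* out, const float* lhs, const float* rhs,
                                int64_t outer_size, int64_t broadcast_size) {
  for (int64_t o = 0; o < outer_size; ++o) {
    for (int64_t i = 0; i < broadcast_size; ++i) {
      // Every element in row o of lhs is scaled by the single rhs value
      // for that row (rhs has size 1 in the last dim).
      out[o * broadcast_size + i] = lhs[o * broadcast_size + i] * rhs[o];
    }
  }
}
```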

Tensor& handle_broadcast_mul(
KernelRuntimeContext& ctx,
const Tensor& a,
const Tensor& b,
Tensor& out,
const ElementwiseOptimizedPath selected_optimized_path) {
if ((selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastLastDim) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastLastDimReverseArguments)) {
return handle_last_dim_broadcast(ctx, a, b, out, selected_optimized_path);
}

ScalarType out_type = out.scalar_type();
const Tensor* lhs;
const Tensor* rhs;
if ((selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
lhs = &b;
rhs = &a;
} else {
// Catch failure to update logic when adding new broadcasting possibility.
ET_DCHECK(
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNd));
lhs = &a;
rhs = &b;
}
auto error = resize_tensor(out, lhs->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
int64_t outer_size = 1;
int64_t broadcast_size;
int64_t inner_size;
if ((selected_optimized_path == ElementwiseOptimizedPath::kBroadcastNdByNd) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
int32_t broadcast_dim = internal::get_broadcast_dim(*lhs, *rhs);
int32_t broadcast_dim_lhs = lhs->dim() + broadcast_dim;
auto normalized_tensor_size_lhs =
get_normalized_tensor_size(*lhs, broadcast_dim_lhs);
outer_size = normalized_tensor_size_lhs[0];
broadcast_size = normalized_tensor_size_lhs[1];
inner_size = normalized_tensor_size_lhs[2];
} else {
broadcast_size = lhs->sizes()[lhs->dim() - 2];
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
outer_size,
broadcast_size,
inner_size);
});
return out;
}
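The Nd-by-Nd path reduces everything to a three-level view: `lhs` as `[outer, broadcast, inner]` and `rhs` as `[outer, 1, inner]`, with `rhs` broadcast along the middle dimension. A scalar sketch of what `broadcasting_map_3d_and_unsqueezed_3d` computes for multiplication, under that layout assumption:

```cpp
#include <cstdint>

// lhs/out: outer * broadcast * inner elements; rhs: outer * inner elements.
void mul_nd_by_nd_ref(float* out, const float* lhs, const float* rhs,
                      int64_t outer, int64_t broadcast, int64_t inner) {
  for (int64_t o = 0; o < outer; ++o) {
    for (int64_t b = 0; b < broadcast; ++b) {
      for (int64_t i = 0; i < inner; ++i) {
        // rhs has size 1 in the middle (broadcast) dim, so its index
        // skips b entirely.
        out[(o * broadcast + b) * inner + i] =
            lhs[(o * broadcast + b) * inner + i] * rhs[o * inner + i];
      }
    }
  }
}
```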
} // namespace

Tensor& opt_mul_out(
@@ -128,60 +238,7 @@ Tensor& opt_mul_out(
out.numel());
});
} else if (selected_optimized_path != ElementwiseOptimizedPath::kNone) {
const Tensor* lhs;
const Tensor* rhs;
if ((selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1dReverseArguments) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
lhs = &b;
rhs = &a;
} else {
// Catch failure to update logic when adding new broadcasting possibility.
ET_DCHECK(
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcast2dBy1d) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNd));
lhs = &a;
rhs = &b;
}
auto error = resize_tensor(out, lhs->sizes());
ET_KERNEL_CHECK_MSG(
ctx,
error == Error::Ok,
InvalidArgument,
out,
"Failed to resize output tensor.");
int64_t outer_size = 1;
int64_t broadcast_size;
int64_t inner_size;
if ((selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNd) ||
(selected_optimized_path ==
ElementwiseOptimizedPath::kBroadcastNdByNdReverseArguments)) {
int32_t broadcast_dim = internal::get_broadcast_dim(*lhs, *rhs);
int32_t broadcast_dim_lhs = lhs->dim() + broadcast_dim;
auto normalized_tensor_size_lhs =
get_normalized_tensor_size(*lhs, broadcast_dim_lhs);
outer_size = normalized_tensor_size_lhs[0];
broadcast_size = normalized_tensor_size_lhs[1];
inner_size = normalized_tensor_size_lhs[2];
} else {
broadcast_size = lhs->sizes()[lhs->dim() - 2];
inner_size = lhs->sizes()[lhs->dim() - 1];
}
ET_SWITCH_REALB_TYPES(out_type, ctx, "mul.out", CTYPE, [&]() {
using Vec = executorch::vec::Vectorized<CTYPE>;
executorch::vec::broadcasting_map_3d_and_unsqueezed_3d<CTYPE>(
[](Vec x, Vec y) { return x * y; },
out.mutable_data_ptr<CTYPE>(),
lhs->const_data_ptr<CTYPE>(),
rhs->const_data_ptr<CTYPE>(),
outer_size,
broadcast_size,
inner_size);
});
return handle_broadcast_mul(ctx, a, b, out, selected_optimized_path);
} else {
ScalarType common_type =
promoteTypes(a_type, b_type, /*half_to_float*/ true);
1 change: 1 addition & 0 deletions kernels/optimized/cpu/targets.bzl
@@ -72,6 +72,7 @@ _OPTIMIZED_ATEN_OPS = (
":binary_ops",
"//executorch/kernels/portable/cpu:scalar_utils",
"//executorch/kernels/portable/cpu/util:broadcast_util",
"//executorch/runtime/core/exec_aten/util:tensor_util",
],
),
op_target(
57 changes: 47 additions & 10 deletions kernels/optimized/lib_defs.bzl
@@ -43,12 +43,56 @@ def get_vec_fbcode_preprocessor_flags():
]
return preprocessor_flags

def get_apple_framework_deps_kwargs(is_fbcode):
# various ovr_configs are not available in oss
if not runtime.is_oss and not is_fbcode:
# Jump through a few hoops since 'frameworks' is not a valid kwarg
# for some buck rules
frameworks = {'frameworks': select({
"DEFAULT": [],
"ovr_config//os:iphoneos": ["$SDKROOT/System/Library/Frameworks/Accelerate.framework"],
"ovr_config//os:macos-arm64": ["$SDKROOT/System/Library/Frameworks/Accelerate.framework"],
"ovr_config//os:macos-x86_64": ["$SDKROOT/System/Library/Frameworks/Accelerate.framework"],
})}
return frameworks
return {'fbobjc_frameworks': ["Accelerate"]}

def get_preprocessor_flags():
# various ovr_configs are not available in oss
preprocessor_flags = select({
":linux-x86_64": [
"-DET_BUILD_WITH_BLAS",
] if not runtime.is_oss else [],
"DEFAULT": [],
})

if not runtime.is_oss:
# various ovr_configs are not available in oss
additional_preprocessor_flags = select({
"ovr_config//os:iphoneos": [
"-DET_BUILD_WITH_BLAS",
"-DET_BUILD_FOR_APPLE",
] if not runtime.is_oss else [],
"ovr_config//os:macos-arm64": [
"-DET_BUILD_WITH_BLAS",
"-DET_BUILD_FOR_APPLE",
] if not runtime.is_oss else [],
"ovr_config//os:macos-x86_64": [
"-DET_BUILD_WITH_BLAS",
"-DET_BUILD_FOR_APPLE",
] if not runtime.is_oss else [],
"DEFAULT": [],
})
preprocessor_flags = preprocessor_flags + additional_preprocessor_flags
return preprocessor_flags
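These flags plausibly gate the BLAS backend along the lines below. This is a hypothetical sketch, not ExecuTorch source: on Apple builds the Accelerate framework supplies the CBLAS symbols (e.g. `cblas_sgemm`), so once the framework is linked, no separate BLAS library is required.

```cpp
// Hypothetical illustration of how -DET_BUILD_WITH_BLAS and
// -DET_BUILD_FOR_APPLE could select the BLAS provider.
#ifdef ET_BUILD_WITH_BLAS
#ifdef ET_BUILD_FOR_APPLE
#include <Accelerate/Accelerate.h>  // Supplies cblas_sgemm on macOS/iOS.
#else
#include <cblas.h>  // e.g. MKL or OpenBLAS on other platforms.
#endif

// C = A * B for row-major MxK (A) and KxN (B) single-precision matrices.
inline void sgemm_sketch(const float* a, const float* b, float* c,
                         int m, int n, int k) {
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
              /*alpha=*/1.0f, a, /*lda=*/k, b, /*ldb=*/n,
              /*beta=*/0.0f, c, /*ldc=*/n);
}
#endif  // ET_BUILD_WITH_BLAS
```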


# Currently, having a dependency on fbsource//third-party/sleef:sleef may cause
# duplicate symbol errors when linking fbcode targets in opt mode that also
# depend on ATen. This is because ATen accesses sleef via the third-party folder
# in caffe2 (caffe2/third-party//sleef:sleef).
# TODO(ssjia): Enable -DCPU_CAPABILITY_AVX2 in fbcode, which requires sleef.
def define_libs():
def define_libs(is_fbcode=False):
runtime.cxx_library(
name = "libvec",
srcs = [],
@@ -131,12 +175,7 @@
"//executorch/...",
"@EXECUTORCH_CLIENTS",
],
preprocessor_flags = select({
":linux-x86_64": [
"-DET_BUILD_WITH_BLAS",
] if not runtime.is_oss else [],
"DEFAULT": [],
}),
preprocessor_flags = get_preprocessor_flags(),
fbandroid_platform_preprocessor_flags = [
(
"^android-arm64.*$",
@@ -157,9 +196,6 @@
"-DET_BUILD_WITH_BLAS",
"-DET_BUILD_FOR_APPLE",
],
fbobjc_frameworks = [
"Accelerate",
],
deps = select({
":linux-x86_64": [mkl_dep] if not runtime.is_oss else [],
"DEFAULT": [],
Expand All @@ -169,4 +205,5 @@ def define_libs():
"//executorch/kernels/optimized:libutils",
"//executorch/runtime/core/exec_aten:lib",
],
**get_apple_framework_deps_kwargs(is_fbcode),
)
4 changes: 2 additions & 2 deletions kernels/optimized/targets.bzl
@@ -2,14 +2,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load("@fbsource//xplat/executorch/codegen:codegen.bzl", "et_operator_library", "executorch_generated_lib")
load(":lib_defs.bzl", "define_libs")

def define_common_targets():
def define_common_targets(is_fbcode=False):
"""Defines targets that should be shared between fbcode and xplat.

The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

define_libs()
define_libs(is_fbcode)

runtime.export_file(
name = "optimized.yaml",
38 changes: 38 additions & 0 deletions kernels/optimized/vec/functional_base.h
@@ -380,5 +380,43 @@ inline void broadcasting_map_2d_by_1d(
broadcasting_map_3d_and_unsqueezed_3d(vec_fun, output_data, input_data, input_data2, 1, size, size2);
}

/*
The following function implements a broadcasting binary operation on two
tensors, where the lhs tensor is treated as shape [outer_size, broadcast_size]
and the rhs tensor is treated as shape [outer_size, 1].
Any two N-dimensional tensors can be mapped to this formula
when lhs size = [lhs0, lhs1, ..., lhsN-1] and rhs size = [rhs0, rhs1, ..., 1]
by viewing the two tensors as
lhs size = [lhs0 * lhs1 * ... * lhsN-2, lhsN-1]
rhs size = [rhs0 * rhs1 * ... * rhsN-2, 1]
*/
template <typename scalar_t, typename Op>
inline void broadcasting_map_broadcast_last_dim(
const Op& vec_fun,
scalar_t* output_data,
const scalar_t* lhs,
const scalar_t* rhs,
int64_t outer_size,
int64_t broadcast_size) {
using Vec = vec::Vectorized<scalar_t>;
int64_t outer_stride_lhs = broadcast_size;
for (int64_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
const scalar_t* lhs_outer = lhs + outer_idx * outer_stride_lhs;
scalar_t* output_data_row = output_data + outer_idx * outer_stride_lhs;
int64_t inner_idx = 0;
Vec data_vec2 = Vec(rhs[outer_idx]);
for (; inner_idx < broadcast_size - (broadcast_size % Vec::size()); inner_idx += Vec::size()) {
Vec data_vec = Vec::loadu(lhs_outer + inner_idx);
Vec output_vec = vec_fun(data_vec, data_vec2);
output_vec.store(output_data_row + inner_idx);
}
if (broadcast_size - inner_idx > 0) {
Vec data_vec = Vec::loadu(lhs_outer + inner_idx, broadcast_size - inner_idx);
Vec output_vec = vec_fun(data_vec, data_vec2);
output_vec.store(output_data_row + inner_idx, broadcast_size - inner_idx);
}
}
}

} // namespace vec
} // namespace executorch
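A usage sketch for the new helper, assuming the executorch vec headers are on the include path and that `functional.h` pulls in `functional_base.h` (the diff shows op_mul.cpp including `vec.h`; the exact header layout is an assumption here). `lhs` is viewed as `[2, 4]` and `rhs` as `[2, 1]`; with a vector width larger than 4, the entire row is handled by the masked `loadu`/`store` tail at the end of the loop, so the example also exercises that path.

```cpp
#include <executorch/kernels/optimized/vec/functional.h>  // assumed entry point
#include <executorch/kernels/optimized/vec/vec.h>

void example() {
  float lhs[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // viewed as [2, 4]
  float rhs[2] = {10, 100};                 // viewed as [2, 1]
  float out[8];
  using Vec = executorch::vec::Vectorized<float>;
  executorch::vec::broadcasting_map_broadcast_last_dim<float>(
      [](Vec x, Vec y) { return x * y; }, out, lhs, rhs,
      /*outer_size=*/2, /*broadcast_size=*/4);
  // Expected: out == {10, 20, 30, 40, 500, 600, 700, 800}
}
```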