
Commit 979708d

dijopaul and cad-audio authored
Updating cadence ops with new name space, rebasing 6 optimized ops (#6407)
* Main backup (#12)
* Add nnlib as submodule
* Adding nnlib submodule
* Integrated nnlib API under backends/cadence/hifi
* Fix review comments on PR #3
* Add nnlib as submodule
* Adding nnlib submodule
* Integrated nnlib API under backends/cadence/hifi
* Fix review comments on PR #3
* Incorporated feedback from Meta team
* Fixed lint errors
* Adding Sub operator optimized version
* Add optimization for add, mul operators
* Adding Div operator
* Modified div mod to cover truncate and floor modes

Co-authored-by: cad-audio <[email protected]>
Co-authored-by: cad-audio <[email protected]>

* Adding sigmoid optimizations
* Adding tanh optimizations
* Fixing review comments in 5483
* Adding cflags to prevent compilation halts
* Adding cflags to prevent compilation halts
* Changing namespace of optimized ops; remove unused ops from file
* Fixed lint issues
* Namespace updates for cadence ops, adding 6 optimized ops

Co-authored-by: cad-audio <[email protected]>
Co-authored-by: cad-audio <[email protected]>
1 parent 5a34bc1 commit 979708d

16 files changed: +3595 −12 lines

1.txt

Lines changed: 676 additions & 0 deletions
Large diffs are not rendered by default.

backends/cadence/aot/functions_hifi.yaml

Lines changed: 11 additions & 6 deletions
@@ -25,7 +25,7 @@
 - op: add.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::add_out
+      kernel_name: impl::HiFi::add_out

 - op: bmm.out
   kernels:
@@ -45,12 +45,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out
+      kernel_name: cadence::impl::HiFi::div_out

 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
+      kernel_name: cadence::impl::HiFi::div_out_mode

 - op: embedding.out
   kernels:
@@ -65,7 +65,7 @@
 - op: mul.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::mul_out
+      kernel_name: cadence::impl::HiFi::mul_out

 - op: permute_copy.out
   kernels:
@@ -75,7 +75,7 @@
 - op: sigmoid.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sigmoid_out
+      kernel_name: cadence::impl::HiFi::sigmoid_out

 - op: slice_copy.Tensor_out
   kernels:
@@ -90,7 +90,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sub_out
+      kernel_name: cadence::impl::HiFi::sub_out
+
+- op: tanh.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::tanh_out

 - op: view_copy.out
   kernels:
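Each kernel_name above binds an out-variant op to a C++ symbol provided by the HiFi backend instead of the portable torch::executor implementation. For orientation only, the shape these names resolve to can be seen in the add_out kernel added by this commit (op_add.cpp below); a minimal declaration sketch, assuming the ExecuTorch runtime headers, is:

// Sketch of the out-variant kernel signature behind the add.out entry;
// mirrors the definition in op_add.cpp below. Declaration only, shown here
// for orientation.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace impl {
namespace HiFi {
namespace native {

// Takes the runtime context, the inputs, and a pre-allocated output tensor,
// and returns a reference to that output tensor.
::exec_aten::Tensor& add_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::exec_aten::Tensor& a,
    const ::exec_aten::Tensor& b,
    const ::exec_aten::Scalar& alpha,
    ::exec_aten::Tensor& out);

} // namespace native
} // namespace HiFi
} // namespace impl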

backends/cadence/cadence.cmake

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++)

 set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
 set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
+# Workaround for long compilation times
+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing")
+
 set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET})
 set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld)
 add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static)

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ add_library(
   cadence_kernels
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)

backends/cadence/hifi/kernels/kernels.h

Lines changed: 43 additions & 0 deletions
@@ -11,6 +11,49 @@
 #include <inttypes.h>
 #include <stddef.h>
 #include <xa_type_def.h>
+/* For NNLIB APIs */
+#include "xa_nnlib_kernels_api.h"
+
+/* Potential NNLIB function/APIs */
+extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const FLOAT32* __restrict__ p_inp1,
+    const FLOAT32* __restrict__ p_inp2,
+    WORD32 num_elm,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);

 namespace cadence {
 namespace impl {
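The *_broadcast_4D_* entry points above take fixed-length 4-element shape arrays. Callers with tensors of rank lower than 4 therefore left-pad the shapes with 1s before the call, as the add kernel in op_add.cpp below does. A self-contained sketch of that padding step (the helper name and the use of std::array are illustrative, not part of the commit):

#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper: left-pad a shape of rank <= 4 with 1s so it matches
// the fixed 4D layout expected by the nnlib broadcast kernels. This mirrors
// the per-operator padding loops in op_add.cpp below.
std::array<int32_t, 4> pad_shape_to_4d(const std::vector<int32_t>& shape) {
  std::array<int32_t, 4> padded = {1, 1, 1, 1};
  const std::size_t offset = 4 - shape.size(); // assumes shape.size() <= 4
  for (std::size_t i = 0; i < shape.size(); ++i) {
    padded[i + offset] = shape[i];
  }
  return padded;
}

// Example: a rank-2 shape {8, 3} becomes {1, 1, 8, 3}, which can then be
// broadcast against a full 4D output shape.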

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 17 additions & 6 deletions
@@ -20,6 +20,12 @@ endif()

 # ATen compliant ops that are needed to run this model.
 set(_aten_ops__srcs
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
@@ -29,24 +35,29 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
-)
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
+)
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)
backends/cadence/hifi/operators/op_add.cpp

Lines changed: 204 additions & 0 deletions
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/kernels/portable/cpu/scalar_utils.h>
#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
#include <executorch/kernels/portable/cpu/util/functional_util.h>
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

using exec_aten::Scalar;
using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::runtime::can_cast;
using executorch::runtime::CppTypeToScalarType;
using executorch::runtime::KernelRuntimeContext;
using torch::executor::Error;

namespace impl {
namespace HiFi {
namespace native {

namespace {
template <
    bool can_cast,
    typename CTYPE_A,
    typename CTYPE_B,
    typename CTYPE_IN,
    typename CTYPE_OUT>
struct AddInner;

template <
    typename CTYPE_A,
    typename CTYPE_B,
    typename CTYPE_IN,
    typename CTYPE_OUT>
struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
  static void
  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
    torch::executor::apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
          CTYPE_IN value = a_casted + alpha_val * b_casted;

          return static_cast<CTYPE_OUT>(value);
        },
        a,
        b,
        out);
  }
};

template <typename CTYPE_IN>
struct ReportCanCastBug {
  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
  }
};

template <
    typename CTYPE_A,
    typename CTYPE_B,
    typename CTYPE_IN,
    typename CTYPE_OUT>
struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
    : public ReportCanCastBug<CTYPE_IN> {};

} // namespace

Tensor& add_out(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    const Scalar& alpha,
    Tensor& out) {
  ET_KERNEL_CHECK(
      ctx,
      torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
      InvalidArgument,
      out);

  ET_KERNEL_CHECK(
      ctx,
      executorch::runtime::tensor_is_realhbbf16_type(out),
      InvalidArgument,
      out);
  ET_KERNEL_CHECK(
      ctx,
      executorch::runtime::tensors_have_same_dim_order(a, b, out),
      InvalidArgument,
      out);

  ScalarType a_type = a.scalar_type();
  ScalarType b_type = b.scalar_type();
  ScalarType alpha_type =
      torch::executor::native::utils::get_scalar_dtype(alpha);
  ScalarType common_type =
      executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true);
  ScalarType out_type = out.scalar_type();

  ET_KERNEL_CHECK(
      ctx,
      executorch::runtime::canCast(common_type, out_type),
      InvalidArgument,
      out);
  ET_KERNEL_CHECK(
      ctx,
      torch::executor::check_alpha_type(alpha_type, common_type),
      InvalidArgument,
      out);

  float alpha_val;
  torch::executor::native::utils::extract_scalar(alpha, &alpha_val);

  constexpr auto name = "add.out";
  constexpr int kNnlibMaxDim = 4; /* fallback if broadcast and dim > 4 */

  int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
  bool optimized = 1;
  /* find broadcast */
  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
  int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
  max_dim = out.dim() > max_dim ? out.dim() : max_dim;

  if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
    optimized = 0;

  if ((a_dim == 0) || (b_dim == 0))
    optimized = 0;

  if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
    optimized = 0;

  if (optimized) {
    const float* const a_data = a.const_data_ptr<float>();
    const float* const b_data = b.const_data_ptr<float>();
    float* const out_data = out.mutable_data_ptr<float>();

    if (broadcast == 1) {
      int out_shape[kNnlibMaxDim];
      int inp1_shape[kNnlibMaxDim];
      int inp2_shape[kNnlibMaxDim];

      for (int i = 0; i < kNnlibMaxDim; i++) {
        out_shape[i] = 1;
        inp1_shape[i] = 1;
        inp2_shape[i] = 1;
      }

      int off_o = kNnlibMaxDim - out.dim();
      int off_a = kNnlibMaxDim - a.dim();
      int off_b = kNnlibMaxDim - b.dim();

      for (int i = 0; i < out.dim(); i++)
        out_shape[i + off_o] = out.size(i);
      for (int i = 0; i < a.dim(); i++)
        inp1_shape[i + off_a] = a.size(i);
      for (int i = 0; i < b.dim(); i++)
        inp2_shape[i + off_b] = b.size(i);

      xa_nn_elm_add_broadcast_4D_f32xf32_f32(
          out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
    } else {
      xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
    }

    return out;
  }

  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
      using CTYPE_IN = typename torch::executor::
          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
      CTYPE_IN alpha_val;
      torch::executor::native::utils::extract_scalar(alpha, &alpha_val);

      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
        AddInner<
            can_cast<CTYPE_IN, CTYPE_OUT>::value,
            CTYPE_A,
            CTYPE_B,
            CTYPE_IN,
            CTYPE_OUT>::run(a, b, alpha_val, out);
      });
    });
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
