
Commit 2080877

Revert "Revert "Updating cadence ops with new name space, rebasing 6 optimize…"
Differential Revision: D65180409
Pull Request resolved: #6570
1 parent fd2844c commit 2080877

15 files changed (+2921 −12 lines)

backends/cadence/aot/functions_hifi.yaml

Lines changed: 11 additions & 6 deletions

@@ -25,7 +25,7 @@
 - op: add.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::add_out
+      kernel_name: cadence::impl::HiFi::add_out

 - op: bmm.out
   kernels:
@@ -45,12 +45,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out
+      kernel_name: cadence::impl::HiFi::div_out

 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
+      kernel_name: cadence::impl::HiFi::div_out_mode

 - op: embedding.out
   kernels:
@@ -65,7 +65,7 @@
 - op: mul.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::mul_out
+      kernel_name: cadence::impl::HiFi::mul_out

 - op: permute_copy.out
   kernels:
@@ -75,7 +75,7 @@
 - op: sigmoid.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sigmoid_out
+      kernel_name: cadence::impl::HiFi::sigmoid_out

 - op: slice_copy.Tensor_out
   kernels:
@@ -90,7 +90,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sub_out
+      kernel_name: cadence::impl::HiFi::sub_out
+
+- op: tanh.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::tanh_out

 - op: view_copy.out
   kernels:

backends/cadence/cadence.cmake

Lines changed: 3 additions & 0 deletions

@@ -43,6 +43,9 @@ set(CMAKE_CXX_COMPILER ${TOOLCHAIN_HOME}/bin/${CROSS_COMPILE_TARGET}-clang++)

 set(CMAKE_C_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
 set(CMAKE_CXX_FLAGS_INIT "-stdlib=libc++ -mtext-section-literals -mlongcalls")
+#workaround for larger compilation time
+set(CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS_INIT} -fno-strict-aliasing")
+
 set(CMAKE_SYSROOT ${TOOLCHAIN_HOME}/${SYSROOT_TARGET})
 set(CMAKE_LINKER ${TOOLCHAIN_HOME}/bin/xt-ld)
 add_link_options(-lm -stdlib=libc++ -Wl,--no-as-needed -static)

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -9,6 +9,10 @@ add_library(
   cadence_kernels
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)

backends/cadence/hifi/kernels/kernels.h

Lines changed: 43 additions & 0 deletions

@@ -11,6 +11,49 @@
 #include <inttypes.h>
 #include <stddef.h>
 #include <xa_type_def.h>
+/* For NNLIB APIs */
+#include "xa_nnlib_kernels_api.h"
+
+/* Potential NNLIB function/APIs */
+extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);
+
+extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const FLOAT32* __restrict__ p_inp1,
+    const FLOAT32* __restrict__ p_inp2,
+    WORD32 num_elm,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape,
+    WORD32 mode);
+
+extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
+    FLOAT32* __restrict__ p_out,
+    const WORD32* const p_out_shape,
+    const FLOAT32* __restrict__ p_inp1,
+    const WORD32* const p_inp1_shape,
+    const FLOAT32* __restrict__ p_inp2,
+    const WORD32* const p_inp2_shape);

 namespace cadence {
 namespace impl {
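For reference, here is a minimal usage sketch of the broadcast add declared above, mirroring how the HiFi add_out fast path (see op_add.cpp later in this commit) invokes it: buffers are plain float arrays and every shape is right-aligned and padded to four dimensions. The example data and the function name example_broadcast_add are illustrative only, not part of the commit.

// Illustrative sketch: calling the NNLIB 4D broadcast add declared above.
// Assumes the HiFi NNLIB kernels added by this commit are linked in.
#include <executorch/backends/cadence/hifi/kernels/kernels.h>

void example_broadcast_add() {
  FLOAT32 a[2 * 3] = {1, 2, 3, 4, 5, 6}; // logical shape [2, 3]
  FLOAT32 b[3] = {10, 20, 30};           // logical shape [3], broadcast over rows
  FLOAT32 out[2 * 3];

  // Shapes are right-aligned into 4D and padded with leading 1s.
  WORD32 a_shape[4] = {1, 1, 2, 3};
  WORD32 b_shape[4] = {1, 1, 1, 3};
  WORD32 out_shape[4] = {1, 1, 2, 3};

  xa_nn_elm_add_broadcast_4D_f32xf32_f32(
      out, out_shape, a, a_shape, b, b_shape);
}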

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 17 additions & 6 deletions

@@ -20,6 +20,12 @@ endif()

 # ATen compliant ops that are needed to run this model.
 set(_aten_ops__srcs
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
+  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
@@ -29,24 +35,29 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_mul.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sigmoid.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sub.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
-)
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
+)
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
 target_link_libraries(aten_ops_cadence PRIVATE cadence_kernels)
backends/cadence/hifi/operators/op_add.cpp

Lines changed: 206 additions & 0 deletions

@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/kernels/portable/cpu/scalar_utils.h>
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <executorch/runtime/platform/assert.h>
+
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using executorch::runtime::can_cast;
+using executorch::runtime::CppTypeToScalarType;
+using executorch::runtime::KernelRuntimeContext;
+using torch::executor::Error;
+
+namespace cadence {
+namespace impl {
+namespace HiFi {
+namespace native {
+
+namespace {
+template <
+    bool can_cast,
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner;
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<true, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT> {
+  static void
+  run(const Tensor& a, const Tensor& b, CTYPE_IN alpha_val, Tensor& out) {
+    torch::executor::apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
+        // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue)
+        [alpha_val](const CTYPE_A val_a, const CTYPE_B val_b) {
+          CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
+          CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
+          CTYPE_IN value = a_casted + alpha_val * b_casted;
+
+          return static_cast<CTYPE_OUT>(value);
+        },
+        a,
+        b,
+        out);
+  }
+};
+
+template <typename CTYPE_IN>
+struct ReportCanCastBug {
+  static void run(const Tensor&, const Tensor&, CTYPE_IN, Tensor&) {
+    ET_DCHECK_MSG(false, "BUG: canCast should have been checked above");
+  }
+};
+
+template <
+    typename CTYPE_A,
+    typename CTYPE_B,
+    typename CTYPE_IN,
+    typename CTYPE_OUT>
+struct AddInner<false, CTYPE_A, CTYPE_B, CTYPE_IN, CTYPE_OUT>
+    : public ReportCanCastBug<CTYPE_IN> {};
+
+} // namespace
+
+Tensor& add_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    const Tensor& b,
+    const Scalar& alpha,
+    Tensor& out) {
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbbf16_type(out),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensors_have_same_dim_order(a, b, out),
+      InvalidArgument,
+      out);
+
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = b.scalar_type();
+  ScalarType alpha_type =
+      torch::executor::native::utils::get_scalar_dtype(alpha);
+  ScalarType common_type =
+      executorch::runtime::promoteTypes(a_type, b_type, /*half_to_float*/ true);
+  ScalarType out_type = out.scalar_type();
+
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::canCast(common_type, out_type),
+      InvalidArgument,
+      out);
+  ET_KERNEL_CHECK(
+      ctx,
+      torch::executor::check_alpha_type(alpha_type, common_type),
+      InvalidArgument,
+      out);
+
+  float alpha_val;
+  torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
+
+  constexpr auto name = "add.out";
+  constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */
+
+  int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
+  bool optimized = 1;
+  /*find broadcast*/
+  const bool a_is_broadcasted = !out.sizes().equals(a.sizes());
+  const bool b_is_broadcasted = !out.sizes().equals(b.sizes());
+  const bool broadcast = (a_is_broadcasted || b_is_broadcasted);
+  int max_dim = a.dim() > b.dim() ? a.dim() : b.dim();
+  max_dim = out.dim() > max_dim ? out.dim() : max_dim;
+
+  if ((out_type != ScalarType::Float) || (alpha_val != 1.0))
+    optimized = 0;
+
+  if ((a_dim == 0) || (b_dim == 0))
+    optimized = 0;
+
+  if ((broadcast == 1) && (max_dim > kNnlibMaxDim))
+    optimized = 0;
+
+  if (optimized) {
+    const float* const a_data = a.const_data_ptr<float>();
+    const float* const b_data = b.const_data_ptr<float>();
+    float* const out_data = out.mutable_data_ptr<float>();
+
+    if (broadcast == 1) {
+      int out_shape[kNnlibMaxDim];
+      int inp1_shape[kNnlibMaxDim];
+      int inp2_shape[kNnlibMaxDim];
+
+      for (int i = 0; i < kNnlibMaxDim; i++) {
+        out_shape[i] = 1;
+        inp1_shape[i] = 1;
+        inp2_shape[i] = 1;
+      }
+
+      int off_o = kNnlibMaxDim - out.dim();
+      int off_a = kNnlibMaxDim - a.dim();
+      int off_b = kNnlibMaxDim - b.dim();
+
+      for (int i = 0; i < out.dim(); i++)
+        out_shape[i + off_o] = out.size(i);
+      for (int i = 0; i < a.dim(); i++)
+        inp1_shape[i + off_a] = a.size(i);
+      for (int i = 0; i < b.dim(); i++)
+        inp2_shape[i + off_b] = b.size(i);
+
+      xa_nn_elm_add_broadcast_4D_f32xf32_f32(
+          out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
+    } else {
+      xa_nn_elm_add_f32xf32_f32(out_data, a_data, b_data, out.numel());
+    }
+
+    return out;
+  }
+
+  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
+    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
+      using CTYPE_IN = typename torch::executor::
+          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
+      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
+      CTYPE_IN alpha_val;
+      torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
+
+      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
+        AddInner<
+            can_cast<CTYPE_IN, CTYPE_OUT>::value,
+            CTYPE_A,
+            CTYPE_B,
+            CTYPE_IN,
+            CTYPE_OUT>::run(a, b, alpha_val, out);
+      });
+    });
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
+} // namespace cadence
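As a reading aid, the conditions that gate the NNLIB fast path in add_out above can be restated as a small standalone predicate; the helper name can_use_nnlib_add is hypothetical and only summarizes the checks already present in the kernel.

// Sketch of the fast-path eligibility check performed in add_out above.
// Hypothetical helper for illustration; not part of the commit.
constexpr int kNnlibMaxDim = 4;

bool can_use_nnlib_add(
    bool out_is_float,  // out.scalar_type() == ScalarType::Float
    float alpha_val,    // value extracted from the alpha Scalar
    int a_dim,
    int b_dim,
    int max_dim,        // max rank across a, b, and out
    bool broadcast) {   // true if either input's sizes differ from out's
  if (!out_is_float || alpha_val != 1.0f)
    return false; // NNLIB path handles float tensors with alpha == 1 only
  if (a_dim == 0 || b_dim == 0)
    return false; // zero-dim inputs fall back to the portable kernel
  if (broadcast && max_dim > kNnlibMaxDim)
    return false; // the broadcast kernel supports at most 4 dimensions
  return true;
}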
