Commit c430cc2

ckmadhira authored and facebook-github-bot committed
Cadence fusiong3 operators m2 (#7490)
Summary: Adds new operators sub, div, exp, permute, slice, and mean to backends/cadence/fusion_g3. For cycle reduction, argument error checks in the operators are now compiled only when the macro OP_ARG_CHECK is defined.

Pull Request resolved: #7490
Differential Revision: D67870337
Pulled By: zonglinpeng
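The cycle-reduction mechanism is a compile-time gate: each operator's argument validation (dtype checks, dim-order checks, broadcast resize) is wrapped in #ifdef OP_ARG_CHECK, so builds that leave the macro undefined skip the checks entirely. A minimal sketch of the pattern, abbreviated from the op_add.cpp diff below (the function body here is illustrative, not the exact kernel code):

// Sketch only: one of several gated validations from the add_out diff.
Tensor& add_out(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Tensor& b,
    const Scalar& alpha,
    Tensor& out) {
#ifdef OP_ARG_CHECK
  // Compiled in only when OP_ARG_CHECK is defined; release builds for the
  // G3 DSP omit these checks to save cycles.
  ET_KERNEL_CHECK(
      ctx,
      torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
      InvalidArgument,
      out);
#endif
  // ... nnlib-backed computation ...
  return out;
}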
1 parent: 68c0208

18 files changed: +2,136 additions, -287 deletions

.gitmodules

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@
 	url = https://github.com/pybind/pybind11.git
 [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
 	path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
-	url = https://github.com/foss-xtensa/nnlib-FusionG3/
+	url = https://github.com/foss-xtensa/nnlib-FusionG3.git
 [submodule "third-party/ao"]
 	path = third-party/ao
 	url = https://github.com/pytorch/ao.git

backends/cadence/aot/functions_fusion_g3.yaml

Lines changed: 20 additions & 5 deletions
@@ -50,12 +50,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out
+      kernel_name: cadence::impl::G3::div_out
 
 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
+      kernel_name: cadence::impl::G3::div_out_mode
 
 - op: embedding.out
   kernels:
@@ -80,7 +80,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: cadence::impl::G3::permute_copy_out
 
 - op: sigmoid.out
   kernels:
@@ -90,7 +90,7 @@
 - op: slice_copy.Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::slice_copy_Tensor_out
+      kernel_name: cadence::impl::G3::slice_copy_Tensor_out
 
 - op: split_with_sizes_copy.out
   kernels:
@@ -100,7 +100,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sub_out
+      kernel_name: cadence::impl::G3::sub_out
+
+- op: sub.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::sub_scalar_out
 
 - op: view_copy.out
   kernels:
@@ -117,6 +122,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::G3::native_layer_norm_out
 
+- op: mean.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::mean_dim_out
+
+- op: exp.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::exp_out
+
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function

backends/cadence/fusion_g3/operators/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -36,6 +36,12 @@ set(_aten_ops__srcs
     "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_sub.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_div.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
@@ -51,6 +57,7 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
 )
 
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)

backends/cadence/fusion_g3/operators/op_add.cpp

Lines changed: 12 additions & 9 deletions
@@ -10,19 +10,20 @@
 
 #include <xa_nnlib_kernels_api.h>
 
-#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
+#include <executorch/backends/cadence/fusion_g3/operators/tensor_util.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
+#include <xa_nnlib_kernels_api.h>
 
-using ::executorch::aten::Scalar;
-using ::executorch::aten::ScalarType;
-using ::executorch::aten::Tensor;
-using ::executorch::runtime::canCast;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::KernelRuntimeContext;
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using executorch::runtime::canCast;
+using torch::executor::Error;
+using torch::executor::KernelRuntimeContext;
 
 namespace cadence {
 namespace impl {
@@ -39,6 +40,7 @@ Tensor& add_out(
   ScalarType common_type =
       executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());
 
+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -62,12 +64,12 @@ Tensor& add_out(
       torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
       InvalidArgument,
       out);
+#endif
 
   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);
 
-  // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";
 
   int kTensorDimensionLimit = 5;
@@ -253,6 +255,7 @@ Tensor& add_scalar_out(
       torch::executor::native::utils::promote_type_with_scalar(
           a.scalar_type(), b);
 
+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -276,7 +279,7 @@ Tensor& add_scalar_out(
       executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok,
       InvalidArgument,
       out);
-
+#endif
   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);

backends/cadence/fusion_g3/operators/op_cat.cpp

Lines changed: 49 additions & 28 deletions
@@ -6,17 +6,17 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <cstring>
-
-#include <xa_nnlib_kernels_api.h>
-
+#include <executorch/backends/cadence/fusion_g3/operators/tensor_util.h>
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <xa_nnlib_kernels_api.h>
+#include <cstring>
 
-using ::executorch::aten::ScalarType;
-using ::executorch::aten::Tensor;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::KernelRuntimeContext;
+using exec_aten::Scalar;
+using exec_aten::ScalarType;
+using exec_aten::Tensor;
+using torch::executor::Error;
+using torch::executor::KernelRuntimeContext;
 
 /* ScalarType in Executorch do not have support for below data types.
  * So, creating a placeholder for these data types. Once, ScalarTypes is
@@ -39,13 +39,15 @@ Tensor& cat_out(
     dim += out.dim();
   }
 
+  int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
+
+#ifdef OP_ARG_CHECK
   ET_KERNEL_CHECK(
       ctx,
       torch::executor::check_cat_args(tensors, dim, out),
       InvalidArgument,
       out);
 
-  int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   torch::executor::get_cat_out_target_size(
@@ -57,6 +59,20 @@ Tensor& cat_out(
           out, {expected_out_size, expected_out_dim}) == Error::Ok,
       InvalidArgument,
       out);
+#endif
+  // Special handling when all inputs are 1D-empty tensors for aten
+  // consistency In that case, just return an 1D-empty tensor without checking
+  // dim
+  bool all_1d_empty = true;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
+      all_1d_empty = false;
+      break;
+    }
+  }
+  if (all_1d_empty) {
+    return out;
+  }
 
   const signed char* inp_tensors[tensors.size()];
   const int* inp_tensors_shapes[tensors.size()];
@@ -87,7 +103,10 @@ Tensor& cat_out(
   }
 
   if (out.scalar_type() == ScalarType::Int) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -97,7 +116,10 @@ Tensor& cat_out(
         (int)dim,
         sizeof(int));
   } else if (out.scalar_type() == ScalarType::Short) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -107,7 +129,10 @@ Tensor& cat_out(
         (int)dim,
         sizeof(short));
   } else if (out.scalar_type() == ScalarType::Char) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -117,7 +142,10 @@ Tensor& cat_out(
         (int)dim,
         sizeof(char));
   } else if (out.scalar_type() == (ScalarType)Uint) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -127,7 +155,10 @@ Tensor& cat_out(
         (int)dim,
         sizeof(int));
   } else if (out.scalar_type() == (ScalarType)Ushort) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -137,7 +168,10 @@ Tensor& cat_out(
         (int)dim,
         sizeof(short));
   } else if (out.scalar_type() == ScalarType::Byte) {
-    xa_nn_cat(
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
@@ -148,19 +182,6 @@ Tensor& cat_out(
         sizeof(char));
 
   } else {
-    // Special handling when all inputs are 1D-empty tensors for aten
-    // consistency In that case, just return an 1D-empty tensor without checking
-    // dim
-    bool all_1d_empty = true;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-      if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
-        all_1d_empty = false;
-        break;
-      }
-    }
-    if (all_1d_empty) {
-      return out;
-    }
     const size_t outer = executorch::runtime::getLeadingDims(out, dim);
     const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
     const size_t ninputs = tensors.size();