
Commit 0e09879

zonglinpeng authored and facebook-github-bot committed
port add sub mul div tanh sigmoid from oss, create new 3p buck targets, add kernel modification (#6601)
Summary: Does the three things in the title: (1) create Buck targets for add, mul, sub, div, sigmoid, and tanh; (2) create new third-party Buck targets for internal use (the OSS version is the single source and leads the GH version, so buckifying the "staging" targets gets us to the latest kernels much faster); (3) modify the Cadence kernels to use the XT_ APIs.

Reviewed By: hsharma35

Differential Revision: D65300260
1 parent 96a9d35 commit 0e09879
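Each of the kernel diffs below follows the same refactor: pick one compute dtype for the op, then hand a scalar lambda to a shared elementwise utility instead of nesting per-dtype ET_SWITCH macros. As a rough orientation, here is a minimal standalone sketch of that shape in plain C++; apply_binary_elementwise is an illustrative stand-in, not the ExecuTorch apply_bitensor_elementwise_fn API, and dtype dispatch and broadcasting are omitted.

#include <cstddef>
#include <vector>

// Stand-in for the shared elementwise utility: apply fn pointwise over two
// equally sized inputs. The real helper also handles dtype dispatch and
// broadcasting, which are omitted here.
template <typename T, typename Fn>
void apply_binary_elementwise(
    Fn fn, const std::vector<T>& a, const std::vector<T>& b, std::vector<T>& out) {
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = fn(a[i], b[i]);
  }
}

int main() {
  std::vector<float> a{1.f, 2.f, 3.f};
  std::vector<float> b{10.f, 20.f, 30.f};
  std::vector<float> out(3);
  const float alpha = 2.f;
  // The op-specific part of add.out reduces to this one lambda.
  apply_binary_elementwise<float>(
      [alpha](float va, float vb) { return va + alpha * vb; }, a, b, out);
  // out == {21, 42, 63}
  return 0;
}

The diffs below plug op-specific lambdas (add with alpha, divide, multiply, sigmoid) into the real utilities and describe the allowed input/output dtypes with SupportedTensorDtypes values.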

File tree

16 files changed: +449 −152 lines

backends/cadence/aot/compiler.py

Lines changed: 2 additions & 1 deletion
@@ -235,12 +235,13 @@ def quantize_and_export_to_cadence(
 def export_to_executorch_gen_etrecord(
     model: torch.nn.Module,
     inputs: tuple[object, ...],
-    dump_graphs: bool = False,
     output_dir: Optional[str] = None,
     opt_level: int = 1,
+    dump_graphs: bool = False,
 ) -> ExecutorchProgramManager:
     edge_prog_manager = export_to_edge(model, inputs)
     cadence_passes = get_cadence_passes(opt_level)
+    edge_prog_manager = export_to_edge(model, inputs, dump_graphs)
 
     # Run a couple required passes for quant/dequant ops
     cadence_prog_manager = edge_prog_manager.transform(

backends/cadence/hifi/operators/op_add.cpp

Lines changed: 22 additions & 18 deletions
@@ -9,6 +9,8 @@
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -121,7 +123,7 @@ Tensor& add_out(
   float alpha_val;
   torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
 
-  constexpr auto name = "add.out";
+  static constexpr const char op_name[] = "add.out";
   constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */
 
   int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim();
@@ -178,23 +180,25 @@ Tensor& add_out(
     return out;
   }
 
-  ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, name, CTYPE_A, [&]() {
-    ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, name, CTYPE_B, [&]() {
-      using CTYPE_IN = typename torch::executor::
-          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      CTYPE_IN alpha_val;
-      torch::executor::native::utils::extract_scalar(alpha, &alpha_val);
-
-      ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() {
-        AddInner<
-            can_cast<CTYPE_IN, CTYPE_OUT>::value,
-            CTYPE_A,
-            CTYPE_B,
-            CTYPE_IN,
-            CTYPE_OUT>::run(a, b, alpha_val, out);
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    const CTYPE_COMPUTE val_alpha =
+        torch::executor::native::utils::scalar_to<CTYPE_COMPUTE>(alpha);
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a + val_alpha * val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;

backends/cadence/hifi/operators/op_div.cpp

Lines changed: 70 additions & 43 deletions
@@ -9,6 +9,8 @@
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/kernels/portable/cpu/util/math_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -134,25 +136,26 @@ div_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
       InvalidArgument,
       out);
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out", CTYPE_B, [&]() {
-      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out", CTYPE_IN, [&]() {
-        ET_SWITCH_FLOAT_TYPES(out_type, ctx, "div.out", CTYPE_OUT, [&]() {
-          torch::executor::
-              apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                  [](const CTYPE_A val_a, const CTYPE_B val_b) {
-                    CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                    CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                    CTYPE_IN value = a_casted / b_casted;
-
-                    return static_cast<CTYPE_OUT>(value);
-                  },
-                  a,
-                  b,
-                  out);
-        });
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "div.out";
+
+  ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              return val_a / val_b;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16);
   });
 
   return out;
@@ -254,35 +257,59 @@ Tensor& div_out_mode(
     return out;
   }
 
-  ET_SWITCH_REAL_TYPES_AND(Bool, a_type, ctx, "div.out_mode", CTYPE_A, [&]() {
-    ET_SWITCH_REAL_TYPES_AND(Bool, b_type, ctx, "div.out_mode", CTYPE_B, [&]() {
-      ET_SWITCH_FLOAT_TYPES(common_type, ctx, "div.out_mode", CTYPE_IN, [&]() {
-        ET_SWITCH_REAL_TYPES(out_type, ctx, "div.out_mode", CTYPE_OUT, [&]() {
-          torch::executor::
-              apply_binary_elementwise_fn<CTYPE_A, CTYPE_B, CTYPE_OUT>(
-                  [mode](const CTYPE_A val_a, const CTYPE_B val_b) {
-                    CTYPE_IN a_casted = static_cast<CTYPE_IN>(val_a);
-                    CTYPE_IN b_casted = static_cast<CTYPE_IN>(val_b);
-                    CTYPE_IN value = a_casted / b_casted;
-                    if (mode.has_value() && mode.value() == "trunc") {
-                      value = std::trunc(value);
-                    } else if (mode.has_value() && mode.value() == "floor") {
-                      value = std::floor(value);
-                    }
-                    return static_cast<CTYPE_OUT>(value);
-                  },
-                  a,
-                  b,
-                  out);
-        });
-      });
-    });
+  bool div_by_zero_error = false;
+  const bool mode_is_trunc = (mode.has_value() && mode.value() == "trunc");
+  // Compute Dtype
+  ScalarType compute_type =
+      torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "div.out";
+
+  ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::
+        apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+            [mode_is_trunc, &div_by_zero_error](
+                const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+              if (executorch::runtime::is_integral_type<
+                      CTYPE_COMPUTE,
+                      /*includeBool=*/true>::value) {
+                if (val_b == 0) {
+                  div_by_zero_error = true;
+                  return static_cast<CTYPE_COMPUTE>(0);
+                }
+              }
+              CTYPE_COMPUTE value = val_a / val_b;
+              if (mode_is_trunc) {
+                value = std::trunc(value);
+              } else {
+                // We established above that the mode is either trunc or floor,
+                // so it must be floor.
+                value =
+                    torch::executor::native::utils::floor_divide(val_a, val_b);
+              }
+              return value;
+            },
+            ctx,
+            a,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            b,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+            out,
+            torch::executor::native::utils::SupportedTensorDtypes::REALHBF16);
   });
 
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      !div_by_zero_error,
+      InvalidArgument,
+      out,
+      "Div mode operation encountered integer division by zero");
+
   return out;
 }
 
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
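The rewritten div.out_mode lambda splits the two rounding modes: std::trunc on the quotient for "trunc", and a floor_divide helper for "floor". The two only disagree when the quotient is negative and fractional, which a tiny standalone check (plain C++, standard library only) makes visible:

#include <cmath>
#include <cstdio>

int main() {
  const float a = -7.f;
  const float b = 2.f;
  const float q = a / b;                // -3.5
  const float trunc_q = std::trunc(q);  // -3: rounds toward zero
  const float floor_q = std::floor(q);  // -4: rounds toward negative infinity
  std::printf("trunc: %g, floor: %g\n", trunc_q, floor_q);
  return 0;
}

The new code also records integer division by zero in div_by_zero_error inside the lambda and reports it once afterwards via ET_KERNEL_CHECK_MSG, a check the old nested-switch version did not perform.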

backends/cadence/hifi/operators/op_mean.cpp

Lines changed: 5 additions & 4 deletions
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
 #include <executorch/kernels/portable/cpu/util/reduce_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -141,11 +142,11 @@ Tensor& mean_dim_out(
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
-    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
+  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
+    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-      const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);
-
+      const size_t num =
+          torch::executor::get_reduced_dim_product(in, dim_list);
       for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
         CTYPE_OUT sum = 0;
         if (in.numel() > 0) {
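For context on the reshuffled lines: num is the product of the sizes of the reduced dimensions, i.e. how many input elements feed each output element, and the mean is sum / num. A small standalone illustration (plain C++, reducing a 2x3 input over its second dimension; not the ExecuTorch reduction utilities):

#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  // 2x3 input, reducing over dim 1: each of the 2 outputs averages num = 3
  // elements, so num is the product of the reduced dimension sizes.
  const std::array<std::array<float, 3>, 2> in{{{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}}};
  const std::size_t num = 3;
  for (std::size_t row = 0; row < in.size(); ++row) {
    float sum = 0.f;
    for (const float v : in[row]) {
      sum += v;
    }
    std::printf("mean[%zu] = %g\n", row, sum / static_cast<float>(num));
  }
  return 0;
}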

backends/cadence/hifi/operators/op_mul.cpp

Lines changed: 21 additions & 15 deletions
@@ -9,6 +9,8 @@
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
@@ -144,20 +146,24 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(a_type, ctx, "mul.out", CTYPE_A, [&]() {
-    ET_SWITCH_REALHB_TYPES(b_type, ctx, "mul.out", CTYPE_B, [&]() {
-      using CTYPE_IN = typename torch::executor::
-          promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-      ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-      ET_SWITCH_REALHB_TYPES(out_type, ctx, "mul.out", CTYPE_OUT, [&]() {
-        MulInner<
-            can_cast<CTYPE_IN, CTYPE_OUT>::value,
-            CTYPE_A,
-            CTYPE_B,
-            CTYPE_IN,
-            CTYPE_OUT>::run(a, b, out);
-      });
-    });
+  // Compute Dtype
+  ScalarType compute_type = torch::executor::native::utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "mul.Scalar_out";
+
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+        [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
+          return val_a * val_b;
+        },
+        ctx,
+        a,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+        b,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+        out,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16);
   });
 
   return out;
@@ -166,4 +172,4 @@ mul_out(RuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) {
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
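With dispatch and broadcasting factored into the shared helper, mul.out differs from add.out only in its lambda. A minimal standalone analogue, using std::transform as a stand-in for the ExecuTorch utility:

#include <algorithm>
#include <vector>

int main() {
  std::vector<float> a{1.f, 2.f, 3.f};
  std::vector<float> b{4.f, 5.f, 6.f};
  std::vector<float> out(3);
  // The whole op-specific part of mul.out is this lambda.
  std::transform(a.begin(), a.end(), b.begin(), out.begin(),
                 [](float x, float y) { return x * y; });
  // out == {4, 10, 18}
  return 0;
}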

backends/cadence/hifi/operators/op_sigmoid.cpp

Lines changed: 23 additions & 14 deletions
@@ -9,6 +9,8 @@
 #include <cmath>
 
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/kernels/portable/cpu/util/dtype_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/util/functional_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
@@ -58,19 +60,26 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
     return out;
   }
 
-  ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() {
-    ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() {
-      torch::executor::apply_unary_map_fn(
-          [](const CTYPE_IN val_in) {
-            // perform math in double to preserve precision
-            double in_casted = static_cast<double>(val_in);
-            double out_val = 1.0 / (1.0 + exp(-in_casted));
-            return static_cast<CTYPE_OUT>(out_val);
-          },
-          in.const_data_ptr<CTYPE_IN>(),
-          out.mutable_data_ptr<CTYPE_OUT>(),
-          in.numel());
-    });
+  ScalarType compute_type =
+      executorch::runtime::isFloatingType(in.scalar_type()) ? in.scalar_type()
+                                                            : ScalarType::Float;
+  compute_type = torch::executor::native::utils::get_compute_type(compute_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "sigmoid.out";
+
+  ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    torch::executor::native::utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+        [](const CTYPE_COMPUTE val_in) {
+          CTYPE_COMPUTE out_val = static_cast<CTYPE_COMPUTE>(1.0) /
+              (static_cast<CTYPE_COMPUTE>(1.0) + exp(-val_in));
+          return out_val;
+        },
+        ctx,
+        in,
+        torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16,
+        out,
+        torch::executor::native::utils::SupportedTensorDtypes::FLOATHBF16);
   });
 
   return out;
@@ -79,4 +88,4 @@ Tensor& sigmoid_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) {
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
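One behavioral detail of the sigmoid rewrite: the old fallback always did the math in double and then narrowed, while the new lambda computes 1/(1 + exp(-x)) directly in the compute type (for float inputs this is float). A standalone snippet (plain C++, standard library only) that makes the difference between the two paths observable; whether it matters for any accuracy target is outside the scope of this diff:

#include <cmath>
#include <cstdio>

int main() {
  const float x = 10.f;
  // Old fallback: evaluate in double, then narrow to the output type.
  const double via_double = 1.0 / (1.0 + std::exp(-static_cast<double>(x)));
  // New lambda: evaluate in the compute type (float here).
  const float via_float = 1.f / (1.f + std::exp(-x));
  std::printf("double path: %.10f\nfloat path:  %.10f\n",
              via_double, static_cast<double>(via_float));
  return 0;
}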
