 */

 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/backends/cadence/hifi/operators/operators.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <xa_nnlib_kernels_api.h>
+#include <xtensa/tie/xt_datacache.h>
 #include <algorithm>
 #include <cmath>
+#include <optional>

 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {

-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;

-void quantized_linear_out(
-    KernelRuntimeContext& ctx,
-    const Tensor& src,
+void _quantized_linear_asym8u(
+    const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t src_zero_point,
+    int64_t in_zero_point,
     const Tensor& weight_zero_point,
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
   // input comes in shape [leading_dims, in_dim]
   // weight comes in shape [out_dim, in_dim]
   // output comes in empty with shape [leading_dims, out_dim]
   // Perform matrix multiply (M x N) x (N x P)' => M x P
-  int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
-  int64_t out_dim = weight.size(0); // = out_dim
-  int64_t in_dim = weight.size(1); // = in_dim
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim

-  const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
   const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
   uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

   // The nnlib kernel to compute quantized linear via matmul.
-  int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
+  int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
       out_data, // p_out
       weight_data, // p_mat1,
       in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
       out_dim, // out_offset, i.e., offset of next output element written
       1, // out_stride, i.e., stride to go to next output row
       -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-      -src_zero_point, // mat2_zero_bias
-      out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
-      out_shift.const_data_ptr<int32_t>(), // out_shift
-      out_zero_point, // out_zero_bias
-      false); // per channel quantization
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
   ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
 }

+void inline _quantized_linear_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8u(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
+
+  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_per_tensor_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_per_tensor_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
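
A note on the requantization parameters threaded through this diff: out_multiplier, out_shift, and out_zero_point describe how each int32 matmul accumulator is mapped back to an 8-bit output, and even the Tensor-typed overloads read only element [0] of the multiplier and shift, so the requantization here is per-tensor throughout. The snippet below is a minimal scalar sketch of that mapping, assuming the common gemmlowp-style convention of a Q31 fixed-point multiplier plus a power-of-two shift; it is not the NNLib implementation (whose exact rounding and shift-sign convention may differ), and requantize_ref is a hypothetical helper used only for illustration.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical reference helper, not part of ExecuTorch or NNLib. It models
// out = clamp(round(acc * scale) + out_zero_point), where
// scale ~= (out_multiplier / 2^31) * 2^out_shift under the gemmlowp-style
// convention assumed here; NNLib's actual rounding/shift convention may differ.
inline int8_t requantize_ref(
    int32_t acc, // sum_k (in[k] - in_zero_point) * (w[k] - w_zero_point) + bias
    int32_t out_multiplier,
    int32_t out_shift,
    int32_t out_zero_point) {
  const double scale = static_cast<double>(out_multiplier) / (1LL << 31) *
      std::pow(2.0, static_cast<double>(out_shift));
  const long requantized = std::lround(acc * scale) + out_zero_point;
  return static_cast<int8_t>(std::min(127L, std::max(-128L, requantized)));
}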