Commit 3366edf

migrate jarvis quant linear out hifi ops to oss
Differential Revision: D64841479
Pull Request resolved: #6490
1 parent 1b12971 commit 3366edf

File tree

4 files changed: +272 -18 lines changed

backends/cadence/aot/functions_hifi.yaml

Lines changed: 4 additions & 0 deletions
@@ -125,3 +125,7 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_linear_out
+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out

backends/cadence/aot/ops_registrations.py

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@
 lib.define(
     "quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
+)
 
 lib.define(
     "quantized_relu(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Y)"

backends/cadence/hifi/kernels/kernels.h

Lines changed: 18 additions & 0 deletions
@@ -38,6 +38,24 @@ WORD32 matmul_asym8uxasym8u_asym8u(
     WORD32 out_zero_bias,
     bool per_channel_quantized = false); // per-channel quantized weight
 
+WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
+    UWORD8* __restrict__ p_out,
+    const UWORD8* __restrict__ p_mat1,
+    const UWORD8* __restrict__ p_mat2,
+    const WORD32* __restrict__ p_bias,
+    WORD32 rows,
+    WORD32 cols,
+    WORD32 row_stride,
+    WORD32 vec_count,
+    WORD32 vec_offset,
+    WORD32 out_offset,
+    WORD32 out_stride,
+    WORD32 mat1_zero_bias,
+    WORD32 vec1_zero_bias,
+    WORD32 out_multiplier,
+    WORD32 out_shift,
+    WORD32 out_zero_bias);
+
 template <typename T>
 T quantize(const float x, float scale, int32_t zero_point);

backends/cadence/hifi/operators/quantized_linear_out.cpp

Lines changed: 247 additions & 18 deletions
@@ -7,46 +7,51 @@
  */
 
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/backends/cadence/hifi/operators/operators.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <xa_nnlib_kernels_api.h>
+#include <xtensa/tie/xt_datacache.h>
 #include <algorithm>
 #include <cmath>
+#include <optional>
 
 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
-void quantized_linear_out(
-    KernelRuntimeContext& ctx,
-    const Tensor& src,
+void _quantized_linear_asym8u(
+    const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t src_zero_point,
+    int64_t in_zero_point,
     const Tensor& weight_zero_point,
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
   // input comes in shape [leading_dims, in_dim]
   // weight comes in shape [out_dim, in_dim]
   // output comes in empty with shape [leading_dims, out_dim]
   // Perform matrix multiply (M x N) x (N x P)' => M x P
-  int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
-  int64_t out_dim = weight.size(0); // = out_dim
-  int64_t in_dim = weight.size(1); // = in_dim
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
 
-  const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
   const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
   uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
 
   // The nnlib kernel to compute quantized linear via matmul.
-  int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
+  int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
       out_data, // p_out
       weight_data, // p_mat1,
       in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
       out_dim, // out_offset, i.e., offset of next output element written
       1, // out_stride, i.e., stride to go to next output row
       -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-      -src_zero_point, // mat2_zero_bias
-      out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
-      out_shift.const_data_ptr<int32_t>(), // out_shift
-      out_zero_point, // out_zero_bias
-      false); // per channel quantization
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
   ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
 }
 
+void inline _quantized_linear_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8u(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
+
+  const int32_t out_multipler_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multipler_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void inline _quantized_linear_per_tensor_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  const int32_t out_multipler_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multipler_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_per_tensor_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_per_tensor_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
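For reference, below is a minimal scalar sketch of the per-tensor quantized linear computation that these HiFi kernels offload to nnlib, shown for the uint8 (asym8u) path in plain standard C++ with no ExecuTorch or nnlib dependencies. The function name and the float-based requantization are illustrative assumptions: the nnlib kernel applies the (out_multiplier, out_shift) pair in fixed point (out_multiplier is assumed here to be a Q31 multiplier, as in the common TFLite-style convention), so its rounding may differ slightly from this sketch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Non-vectorized reference: out = requantize(in_q . weight_q^T + bias),
// with per-tensor zero points and a single (out_multiplier, out_shift) pair.
std::vector<uint8_t> quantized_linear_per_tensor_ref(
    const std::vector<uint8_t>& in, // [leading_dims, in_dim], asym8u
    const std::vector<uint8_t>& weight, // [out_dim, in_dim], asym8u
    const std::vector<int32_t>& bias, // [out_dim]
    int64_t leading_dims,
    int64_t in_dim,
    int64_t out_dim,
    int32_t in_zero_point,
    int32_t weight_zero_point,
    int32_t out_multiplier, // assumed Q31 fixed-point multiplier
    int32_t out_shift,
    int32_t out_zero_point) {
  std::vector<uint8_t> out(leading_dims * out_dim);
  // Approximate the fixed-point (out_multiplier, out_shift) requantization
  // with a floating-point scale.
  const double requant_scale =
      static_cast<double>(out_multiplier) / (1LL << 31) * std::pow(2.0, out_shift);
  for (int64_t m = 0; m < leading_dims; ++m) {
    for (int64_t o = 0; o < out_dim; ++o) {
      int32_t acc = bias[o];
      for (int64_t k = 0; k < in_dim; ++k) {
        // Subtract the per-tensor zero points before multiply-accumulating.
        const int32_t x = static_cast<int32_t>(in[m * in_dim + k]) - in_zero_point;
        const int32_t w =
            static_cast<int32_t>(weight[o * in_dim + k]) - weight_zero_point;
        acc += x * w;
      }
      const int32_t q =
          static_cast<int32_t>(std::lround(acc * requant_scale)) + out_zero_point;
      // Saturate to the asym8u output range.
      out[m * out_dim + o] = static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
    }
  }
  return out;
}

The negated zero points passed to the nnlib matmul in the diff above (mat1_zero_bias = -weight_zero_point, mat2_zero_bias = -in_zero_point) correspond to the zero-point subtractions in this sketch, which suggests nnlib adds its zero biases to the operands before multiplying.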
