
Commit 78b60df

Reland cadence quantized_linear_per_tensor_out cpu 1eb924f^..fd33294 (#7204)
Revert "Revert cadence quantized_linear_per_tensor_out cpu1eb924f^..fd33294 (…" This reverts commit a9565aa.
1 parent 6c93287 commit 78b60df

File tree

9 files changed, +926 -100 lines changed


backends/cadence/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -23,7 +23,6 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
-set(TARGET_DIR reference)
 
 if(EXECUTORCH_CADENCE_CPU_RUNNER)
   include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
@@ -61,6 +60,9 @@ if(EXECUTORCH_CADENCE_CPU_RUNNER)
     ${_common_include_directories}
   )
 
+  set(TARGET_DIR reference)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
+
   target_link_libraries(
     cadence_runner
     executorch

backends/cadence/aot/functions.yaml

Lines changed: 50 additions & 0 deletions
@@ -142,6 +142,41 @@
     - arg_meta: null
       kernel_name: torch::executor::where_out
 
+- op: transpose_copy.int_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::transpose_copy_int_out
+
+- op: eq.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::eq_scalar_out
+
+- op: logical_not.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::logical_not_out
+
+- op: any.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::any_out
+
+- op: native_group_norm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::native_group_norm_out
+
+- op: sum.IntList_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::sum_dim_out
+
+- op: select_copy.int_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::select_copy_int_out
+
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
@@ -183,3 +218,18 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_matmul_out
+
+- func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_linear_per_tensor_out
+
+- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::im2row_out
+
+- func: cadence::quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_conv_per_tensor_out

backends/cadence/reference/operators/CMakeLists.txt

Lines changed: 11 additions & 0 deletions
@@ -55,6 +55,16 @@ set(_aten_ops__srcs
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_expand_copy.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp"
     "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_empty.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_transpose_copy.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_eq.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_logical_not.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_any.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_native_group_norm.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_sum.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_select_copy.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
+    "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/select_copy_util.cpp"
 )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
@@ -78,6 +88,7 @@ add_library(
   "quantize_per_tensor.cpp"
   "dequantize_per_tensor.cpp"
   "quantized_matmul_out.cpp"
+  "im2row_out.cpp"
 )
 target_include_directories(
   custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include <executorch/backends/cadence/reference/operators/operators.h>

#include <algorithm>

namespace impl {
namespace reference {
namespace native {

using ::executorch::aten::IntArrayRef;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

template <typename T>
__attribute__((always_inline)) void im2row_(
    const T* __restrict__ data_im,
    const int32_t in_zero_point,
    /* input parameters */
    const int32_t channels,
    const int32_t height,
    const int32_t width,
    /* output parameters */
    const int32_t out_height,
    const int32_t out_width,
    /* convolution parameters */
    const int32_t kernel_h,
    const int32_t kernel_w,
    const int32_t pad_h,
    const int32_t pad_w,
    const int32_t stride_h,
    const int32_t stride_w,
    const int32_t dilation_h,
    const int32_t dilation_w,
    T* __restrict__ data_col,
    bool channels_last) {
  // Consider convolving the input image of dimensions channels * height * width
  // (or height * width * channels for NHWC layout) with a filter of dimensions
  // channels * kernel_h * kernel_w. Assume that this convolution will produce
  // an output of dimensions out_height x out_width. For each point in the output,
  // im2row takes the data from the input that is used in the computation of
  // that output point, and flattens it into a vector of size channels_col =
  // channels * kernel_h * kernel_w. The output of im2row will therefore be a 2D
  // array of size (out_height * out_width) x channels_col
  const int32_t channels_col = channels * kernel_h * kernel_w;

  // If the layout is NHWC, we can copy 'channels' worth of contiguous data
  // points when performing im2row.
  if (channels_last) {
    // Iterate over the output domain
    for (int _h = 0; _h < out_height; ++_h) {
      for (int _w = 0; _w < out_width; ++_w) {
        int32_t i_col = _h * out_width + _w;
        // Each point in the output domain is the result of applying a filter of
        // size kernel_h x kernel_w x channels on the input. But since channels
        // is contiguous, we will not explicitly have a loop for it.
        for (int _kh = 0; _kh < kernel_h; ++_kh) {
          int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
          for (int _kw = 0; _kw < kernel_w; ++_kw) {
            int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;

            // h_im and w_im are the actual height and width coordinates of the
            // input tensor from where we need to copy 'channels' points.
            const T* __restrict__ slice_im =
                data_im + (h_im * width + w_im) * channels;
            T* __restrict__ slice_col = data_col + i_col * channels_col +
                (_kh * kernel_w + _kw) * channels;
            // If the coordinates were within the input domain, we copy
            // 'channels' contiguous values. Otherwise we will fill the output
            // with 0's.
            if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
              std::memcpy(slice_col, slice_im, channels * sizeof(T));
            } else {
              std::fill_n(slice_col, channels, T(in_zero_point));
            }
          }
        }
      }
    }
  } else {
    // Iterate over the output domain
    for (int _h = 0; _h < out_height; ++_h) {
      for (int _w = 0; _w < out_width; ++_w) {
        int32_t i_col = _h * out_width + _w;

        // Each point in the output domain is the result of applying a filter
        // of size channels * kernel_h * kernel_w on the input
        for (int _c = 0; _c < channels; ++_c) {
          for (int _kh = 0; _kh < kernel_h; ++_kh) {
            for (int _kw = 0; _kw < kernel_w; ++_kw) {
              // c_col is the linearized access in the channels_col vector.
              int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw;
              // h_im and w_im are the actual height and width coordinates of
              // the input tensor that we need to copy to the output.
              int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
              int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
              // If the current data access is within the input tensor, copy the
              // value
              data_col[i_col * channels_col + c_col] =
                  (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
                  ? data_im[(_c * height + h_im) * width + w_im]
                  : static_cast<T>(in_zero_point);
            }
          }
        }
      }
    }
  }
}

void im2row_out(
    __ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& input,
    IntArrayRef kernel_size,
    IntArrayRef dilation,
    IntArrayRef padding,
    IntArrayRef stride,
    const Tensor& in_zero_point,
    bool channel_last,
    Tensor& out) {
  // Compute the input tensor's dims
  bool unit_height = input.dim() == 3;
  const int32_t batch_size = input.size(0);
  const int32_t in_c =
      channel_last ? input.size(3 - unit_height) : input.size(1);
  const int32_t in_h =
      unit_height ? 1 : (channel_last ? input.size(1) : input.size(2));
  const int32_t in_w =
      channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height);

  // Get the kernel parameters
  int32_t kernel_h = kernel_size[0];
  int32_t kernel_w = kernel_size[1];
  int32_t dilation_h = dilation[0];
  int32_t dilation_w = dilation[1];
  int32_t pad_h = padding[0];
  int32_t pad_w = padding[1];
  int32_t stride_h = stride[0];
  int32_t stride_w = stride[1];

  // If we were to apply a convolution on the input tensor, compute the output
  // height and width.
  int32_t out_h =
      (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
  int32_t out_w =
      (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;

  ET_DCHECK_MSG(
      (out_h * out_w) == out.size(1), "dimension mismatch for output");
  ET_DCHECK_MSG(
      (kernel_h * kernel_w * in_c) == out.size(2),
      "dimension mismatch for output");

  // Check if the input is per-tensor quantized or per-channel quantized. The
  // zero point for each batch could differ for per-channel quantized input.
  bool per_tensor_quantized = in_zero_point.numel() == 1;

#define typed_im2row(dtype, ctype)                                     \
  case ScalarType::dtype: {                                            \
    const ctype* __restrict__ in_data = input.const_data_ptr<ctype>(); \
    ctype* __restrict__ out_data = out.mutable_data_ptr<ctype>();      \
    const int32_t* __restrict__ zero_point =                           \
        in_zero_point.const_data_ptr<int32_t>();                       \
    int32_t in_plane = in_c * in_h * in_w;                             \
    int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w;    \
    for (size_t n = 0; n < batch_size; ++n) {                          \
      im2row_<ctype>(                                                  \
          &in_data[n * in_plane],                                      \
          per_tensor_quantized ? zero_point[0] : zero_point[n],        \
          in_c,                                                        \
          in_h,                                                        \
          in_w,                                                        \
          out_h,                                                       \
          out_w,                                                       \
          kernel_h,                                                    \
          kernel_w,                                                    \
          pad_h,                                                       \
          pad_w,                                                       \
          stride_h,                                                    \
          stride_w,                                                    \
          dilation_h,                                                  \
          dilation_w,                                                  \
          &out_data[n * out_plane],                                    \
          channel_last);                                               \
    }                                                                  \
    break;                                                             \
  }

  ScalarType dtype = input.scalar_type();
  switch (dtype) {
    typed_im2row(Float, float);
    typed_im2row(Byte, uint8_t);
    typed_im2row(Char, int8_t);
    default:
      ET_DCHECK_MSG(
          false,
          "im2row not implemented for dtype %s",
          torch::executor::toString(dtype));
  }
#undef typed_im2row
}

} // namespace native
} // namespace reference
} // namespace impl
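
For reference, here is a minimal standalone sketch (not part of this commit) of what the NCHW branch of im2row_ above computes: a 1x1x3x3 input convolved with a 2x2 kernel at stride 1, with no padding or dilation, is flattened into an (out_height * out_width) x (channels * kernel_h * kernel_w) matrix with one patch per row, as the kernel's comments describe. The file name and all values are illustrative only.

// im2row_demo.cpp -- illustrative only; mirrors the NCHW loop nest of im2row_
// with stride 1, pad 0, dilation 1, and no zero-point padding involved.
#include <cstdio>
#include <vector>

int main() {
  const int channels = 1, height = 3, width = 3;
  const int kernel_h = 2, kernel_w = 2;
  const int out_h = height - kernel_h + 1; // 2
  const int out_w = width - kernel_w + 1;  // 2
  const int channels_col = channels * kernel_h * kernel_w; // 4

  std::vector<float> im = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 3x3 image, row-major
  std::vector<float> col(out_h * out_w * channels_col, 0);

  // Same indexing scheme as the NCHW branch above, specialized to the
  // no-padding, unit-stride case (so every access is in bounds).
  for (int h = 0; h < out_h; ++h) {
    for (int w = 0; w < out_w; ++w) {
      int i_col = h * out_w + w;
      for (int c = 0; c < channels; ++c) {
        for (int kh = 0; kh < kernel_h; ++kh) {
          for (int kw = 0; kw < kernel_w; ++kw) {
            int c_col = (c * kernel_h + kh) * kernel_w + kw;
            int h_im = h + kh;
            int w_im = w + kw;
            col[i_col * channels_col + c_col] =
                im[(c * height + h_im) * width + w_im];
          }
        }
      }
    }
  }

  // Each row of 'col' is the flattened 2x2 patch for one output position:
  // row 0 -> {1, 2, 4, 5}, row 1 -> {2, 3, 5, 6}, row 2 -> {4, 5, 7, 8}, ...
  for (int r = 0; r < out_h * out_w; ++r) {
    for (int c = 0; c < channels_col; ++c) {
      std::printf("%g ", col[r * channels_col + c]);
    }
    std::printf("\n");
  }
  return 0;
}
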
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#pragma once

#include <executorch/runtime/core/array_ref.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <optional>

namespace cadence {
namespace impl {
namespace cpu {
namespace native {
namespace {
using ::executorch::runtime::getLeadingDims;

#define ET_FORALL_CADENCE_QUANTIZED_TYPES(_) \
  _(uint8_t, Byte)                           \
  _(int8_t, Char)

inline __attribute__((always_inline)) void linear_(
    const ::executorch::aten::Tensor& input,
    const ::executorch::aten::Tensor& weight,
    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
    ::executorch::aten::Tensor& output) {
  const float* __restrict__ input_data = input.const_data_ptr<float>();
  const float* __restrict__ weight_data = weight.const_data_ptr<float>();
  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
  float* __restrict__ output_data = output.mutable_data_ptr<float>();

  // input comes in shape [batch_size, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [batch_size, out_dim]
  // Perform matrix multiply (M x N) x (N x P) => M x P
  int64_t M = weight.size(0); // = out_dim
  int64_t N = weight.size(1); // = in_dim

  // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the
  // leading dimension is d0 * d1 * ... * d_{N-2}
  int64_t leading_dims = getLeadingDims(input, input.dim() - 1);

  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < M; ++j) {
      float sum = bias_data[j];
      for (int k = 0; k < N; ++k) {
        sum += input_data[i * N + k] * weight_data[j * N + k];
      }
      output_data[i * M + j] = sum;
    }
  }
}

} // namespace
} // namespace native
} // namespace cpu
} // namespace impl
} // namespace cadence
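
For reference, a minimal standalone sketch (not part of this commit) of the arithmetic linear_ above performs: output[i * M + j] = bias[j] + sum over k of input[i * N + k] * weight[j * N + k], with weight laid out as [out_dim, in_dim]. The file name, shapes, and values below are made up for illustration.

// linear_demo.cpp -- illustrative only; same loop nest as linear_ above,
// operating on plain std::vector buffers instead of ExecuTorch tensors.
#include <cstdio>
#include <vector>

int main() {
  const int leading_dims = 2; // batch size
  const int N = 3;            // in_dim
  const int M = 2;            // out_dim

  std::vector<float> input = {1, 2, 3,    // batch row 0
                              4, 5, 6};   // batch row 1
  std::vector<float> weight = {1, 0, 1,   // out channel 0
                               0, 1, 0};  // out channel 1
  std::vector<float> bias = {0.5f, -1.0f};
  std::vector<float> output(leading_dims * M, 0.0f);

  // One dot product per (batch row, out channel) pair, accumulated in float.
  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < M; ++j) {
      float sum = bias[j];
      for (int k = 0; k < N; ++k) {
        sum += input[i * N + k] * weight[j * N + k];
      }
      output[i * M + j] = sum;
    }
  }

  // Expected: row 0 -> {4.5, 1}, row 1 -> {10.5, 4}
  for (int i = 0; i < leading_dims; ++i) {
    std::printf("%g %g\n", output[i * M + 0], output[i * M + 1]);
  }
  return 0;
}
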
