[Cadence] add reference requantize out and tests

zonglinpeng · web-flow · commit 9dcee22c1fe8 · 2025-03-11T14:10:28.000-07:00
Differential Revision: D70906707 Pull Request resolved: #9097
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
@@ -115,11 +115,23 @@ python_library(
     ],
     deps = [
         "fbcode//caffe2:torch",
-        "fbcode//executorch/exir:scalar_type",
         "fbcode//executorch/backends/cadence/aot:utils",
     ],
 )
 
+# Python reference implementations of Cadence custom ops (ref_implementations.py).
+python_library(
+    name = "ref_implementations",
+    srcs = [
+        "ref_implementations.py",
+    ],
+    typing = True,
+    deps = [
+        "fbcode//caffe2:torch",
+        "fbcode//executorch/exir:scalar_type",
+    ],
+)
+
+
 export_file(name = "functions.yaml")
 
 executorch_generated_lib(
diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py
@@ -38,6 +38,8 @@ def export_model(
     example_inputs: Tuple[Any, ...],
     file_name: str = "CadenceDemoModel",
     run_and_compare: bool = True,
+    eps_error: float = 1e-1,
+    eps_warn: float = 1e-5,
 ):
     # create work directory for outputs and model binary
     working_dir = tempfile.mkdtemp(dir="/tmp")
@@ -89,4 +91,6 @@ def export_model(
             inputs=example_inputs,
             ref_outputs=ref_outputs,
             working_dir=working_dir,
+            eps_error=eps_error,
+            eps_warn=eps_warn,
         )
diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml
@@ -248,3 +248,8 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_fully_connected_per_tensor_out
+
+# Out variant of cadence::requantize, mapped to the reference kernel named below.
+- func: cadence::requantize.out(Tensor input, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::requantize_out
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
@@ -94,7 +94,6 @@
     "int[] dilation, SymInt[] output_padding, int groups, bool channel_last=False) -> (Tensor Y)"
 )
 lib.define("dequantize(Tensor X, Tensor X_scale, Tensor X_zero_point) -> (Tensor Y)")
-# cadence::quantized_relu is defined in OSS
 lib.define(
     "quantized_add(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, "
     "Tensor Y_zero_point, float out_scale, int out_zero_point) -> (Tensor Z)"
@@ -119,8 +118,6 @@
     "quantized_embedding_byte(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, "
     "Tensor indices, bool pruned_weights=False) -> (Tensor X)"
 )
-# cadence::quantized_layer_norm is defined in OSS
-# cadence::quantized_conv is defined is OSS
 lib.define(
     "quantized_transposed_conv(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, "
     "int[] dilation, SymInt[] output_padding, int groups, int input_zero_point, Tensor weight_zero_point, "
@@ -156,7 +153,7 @@
 )
 
 # ------------------------------------ #
-#   Migrated from custom_ops.ymal      #
+#   Migrated from custom_ops.yaml      #
 # ------------------------------------ #
 # Migrated from the custom_ops.yaml files containing different operator variants (e.g., .out, .tensor_out)
 lib.define(
@@ -167,7 +164,6 @@
     "transposed_convolution.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, "
     "int[] dilation, SymInt[] output_padding, int groups, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
-# cadence::quantized_relu.out is defined in OSS
 lib.define(
     "quantized_relu.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
 )
@@ -265,14 +261,12 @@
     "_cat_nop.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
-# Custom ops with jarvis_nn_ops namespace
+# Custom ops registered in the jarvis_nn_ops namespace
 jarvis_nn_lib = Library("jarvis_nn_ops", "DEF")
 jarvis_nn_lib.define(
     "attention_mask.out(Tensor input, Tensor start, Tensor stop, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
-m = Library("cadence", "IMPL", "Meta")
-
 
 @register_fake("cadence::quantize_per_tensor")
 def quantize_per_tensor_meta(
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
@@ -0,0 +1,63 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import torch
+from executorch.exir.scalar_type import ScalarType
+from torch.library import impl, Library
+
+
+# Library handle used to register implementations ("IMPL") for ops in the
+# "cadence" namespace under the CompositeExplicitAutograd dispatch key.
+m = Library("cadence", "IMPL", "CompositeExplicitAutograd")
+
+# Quantized dtypes, keyed by ScalarType, with their torch equivalents.
+# requantize() uses membership in this map to select its
+# torch.quantize_per_tensor path.
+qdtype_map: dict[ScalarType, torch.dtype] = {
+    ScalarType.QINT8: torch.qint8,
+    ScalarType.QUINT8: torch.quint8,
+    ScalarType.QINT32: torch.qint32,
+}
+
+
+@impl(m, "requantize")
+def requantize(
+    input: torch.Tensor,
+    in_scale: torch.Tensor,
+    in_zero_point: torch.Tensor,
+    out_scale: torch.Tensor,
+    out_zero_point: torch.Tensor,
+    dtype: ScalarType,
+) -> torch.Tensor:
+    """Reference requantize: dequantize `input`, then quantize it to `dtype`.
+
+    A `dtype` found in `qdtype_map` is handled by `torch.dequantize` followed
+    by `torch.quantize_per_tensor`; `in_scale` and `in_zero_point` are not
+    read on that path. Any other `dtype` is handled by the
+    `quantized_decomposed` dequantize/quantize ops, using the first element
+    of each scale and zero-point tensor and the integer range reported by
+    `torch.iinfo` for `input.dtype` and `dtype`.
+
+    NOTE(review): `qdtype_map` is keyed by `ScalarType`, while
+    `torch.iinfo(dtype)` is called on the same argument -- confirm which
+    type callers actually pass.
+
+    Raises:
+        NotImplementedError: if `in_scale` or `out_scale` holds more than one
+            element on the `quantized_decomposed` path.
+    """
+    if dtype in qdtype_map:
+        # Old quantization mechanism
+        float_input = torch.dequantize(input)
+        target_qdtype = qdtype_map[dtype]
+        return torch.quantize_per_tensor(
+            float_input, out_scale, out_zero_point, target_qdtype
+        )
+
+    # Non-scalar scales would need per-channel quant/dequant, but the channel
+    # dimension is not part of this op's arguments.
+    if not (in_scale.numel() <= 1 and out_scale.numel() <= 1):
+        raise NotImplementedError("Only scalar scales are supported")
+
+    in_range = torch.iinfo(input.dtype)
+    # pyre-fixme[6]: This dtype is actually the right one.
+    out_range = torch.iinfo(dtype)
+
+    decomposed = torch.ops.quantized_decomposed
+    quantize_op = decomposed.quantize_per_tensor
+    dequantized = decomposed.dequantize_per_tensor(
+        input,
+        in_scale.flatten()[0],
+        in_zero_point.flatten()[0],
+        in_range.min,
+        in_range.max,
+        input.dtype,
+    )
+    return quantize_op(
+        dequantized,
+        out_scale.flatten()[0],
+        out_zero_point.flatten()[0],
+        out_range.min,
+        out_range.max,
+        dtype,
+    )
diff --git a/backends/cadence/reference/kernels/kernels.cpp b/backends/cadence/reference/kernels/kernels.cpp
@@ -58,6 +58,36 @@ void dequantize(
   }
 }
 
+// Requantize one quantized value of type IT into a quantized value of type OT.
+// The input is dequantized with (in_scale, in_zero_point) and the result is
+// quantized again with (inv_out_scale, out_zero_point).
+template <typename IT, typename OT>
+OT requantize(
+    const IT in,
+    float in_scale,
+    int32_t in_zero_point,
+    float inv_out_scale,
+    int32_t out_zero_point) {
+  // Float intermediate between the two quantized representations.
+  const float real_value = dequantize<IT>(in, in_scale, in_zero_point);
+  return quantize<OT>(real_value, inv_out_scale, out_zero_point);
+}
+
+// Requantize `size` elements read from `in` (type IT) into `out` (type OT).
+// Every element goes through the scalar requantize with the same
+// in_scale / in_zero_point / inv_out_scale / out_zero_point arguments.
+template <typename IT, typename OT>
+void requantize(
+    OT* __restrict__ out,
+    const IT* __restrict__ in,
+    float in_scale,
+    int32_t in_zero_point,
+    float inv_out_scale,
+    int32_t out_zero_point,
+    size_t size) {
+  for (size_t idx = 0; idx != size; ++idx) {
+    const IT quantized_in = in[idx];
+    out[idx] = requantize<IT, OT>(
+        quantized_in, in_scale, in_zero_point, inv_out_scale, out_zero_point);
+  }
+}
+
 // explicit template instantiation
 
 #define typed_quantize_val(dtype) \
@@ -106,6 +136,58 @@ typed_dequantize_vec(uint16_t);
 typed_dequantize_vec(int32_t);
 #undef typed_dequantize_vec
 
+// Explicitly instantiate the scalar requantize<itype, otype> for every pairing
+// of int8_t, uint8_t, int16_t and uint16_t as input and output type.
+#define typed_requantize_val(itype, otype) \
+  template otype requantize(               \
+      const itype in,                      \
+      float in_scale,                      \
+      int32_t in_zero_point,               \
+      float inv_out_scale,                 \
+      int32_t out_zero_point);
+typed_requantize_val(int8_t, int8_t);
+typed_requantize_val(int8_t, uint8_t);
+typed_requantize_val(int8_t, int16_t);
+typed_requantize_val(int8_t, uint16_t);
+typed_requantize_val(uint8_t, int8_t);
+typed_requantize_val(uint8_t, uint8_t);
+typed_requantize_val(uint8_t, int16_t);
+typed_requantize_val(uint8_t, uint16_t);
+typed_requantize_val(int16_t, int8_t);
+typed_requantize_val(int16_t, uint8_t);
+typed_requantize_val(int16_t, int16_t);
+typed_requantize_val(int16_t, uint16_t);
+typed_requantize_val(uint16_t, int8_t);
+typed_requantize_val(uint16_t, uint8_t);
+typed_requantize_val(uint16_t, int16_t);
+typed_requantize_val(uint16_t, uint16_t);
+#undef typed_requantize_val
+
+// Explicitly instantiate the array requantize<itype, otype> for every pairing
+// of int8_t, uint8_t, int16_t and uint16_t as input and output type.
+#define typed_requantize_vec(itype, otype) \
+  template void requantize(                \
+      otype* __restrict__ out,             \
+      const itype* __restrict__ in,        \
+      float in_scale,                      \
+      int32_t in_zero_point,               \
+      float inv_out_scale,                 \
+      int32_t out_zero_point,              \
+      size_t size);
+typed_requantize_vec(int8_t, int8_t);
+typed_requantize_vec(int8_t, uint8_t);
+typed_requantize_vec(int8_t, int16_t);
+typed_requantize_vec(int8_t, uint16_t);
+typed_requantize_vec(uint8_t, int8_t);
+typed_requantize_vec(uint8_t, uint8_t);
+typed_requantize_vec(uint8_t, int16_t);
+typed_requantize_vec(uint8_t, uint16_t);
+typed_requantize_vec(int16_t, int8_t);
+typed_requantize_vec(int16_t, uint8_t);
+typed_requantize_vec(int16_t, int16_t);
+typed_requantize_vec(int16_t, uint16_t);
+typed_requantize_vec(uint16_t, int8_t);
+typed_requantize_vec(uint16_t, uint8_t);
+typed_requantize_vec(uint16_t, int16_t);
+typed_requantize_vec(uint16_t, uint16_t);
+#undef typed_requantize_vec
+
 }; // namespace kernels
 }; // namespace reference
 }; // namespace impl
diff --git a/backends/cadence/reference/kernels/kernels.h b/backends/cadence/reference/kernels/kernels.h
@@ -36,6 +36,24 @@ void dequantize(
     int32_t zero_point,
     size_t size);
 
+// Requantize a single value: dequantize `in` with (in_scale, in_zero_point),
+// then quantize the result to OT with (inv_out_scale, out_zero_point).
+template <typename IT, typename OT>
+OT requantize(
+    const IT in,
+    float in_scale,
+    int32_t in_zero_point,
+    float inv_out_scale,
+    int32_t out_zero_point);
+
+// Requantize `size` elements from `in` into `out`, one element at a time,
+// using the same scale / zero_point arguments for every element.
+template <typename IT, typename OT>
+void requantize(
+    OT* __restrict__ out,
+    const IT* __restrict__ in,
+    float in_scale,
+    int32_t in_zero_point,
+    float inv_out_scale,
+    int32_t out_zero_point,
+    size_t size);
+
 }; // namespace kernels
 }; // namespace reference
 }; // namespace impl
diff --git a/backends/cadence/reference/operators/CMakeLists.txt b/backends/cadence/reference/operators/CMakeLists.txt
@@ -90,6 +90,7 @@ add_library(
   "quantized_fully_connected_out.cpp"
   "dequantize_per_tensor.cpp"
   "quantized_matmul_out.cpp"
+  "requantize_out.cpp"
   "im2row_out.cpp"
 )
 target_include_directories(
diff --git a/backends/cadence/reference/operators/requantize_out.cpp b/backends/cadence/reference/operators/requantize_out.cpp
diff --git a/examples/cadence/operators/targets.bzl b/examples/cadence/operators/targets.bzl
diff --git a/examples/cadence/operators/test_requantize_op.py b/examples/cadence/operators/test_requantize_op.py

Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ add_library(`
`90`	`90`	`"quantized_fully_connected_out.cpp"`
`91`	`91`	`"dequantize_per_tensor.cpp"`
`92`	`92`	`"quantized_matmul_out.cpp"`
	`93`	`+ "requantize_out.cpp"`
`93`	`94`	`"im2row_out.cpp"`
`94`	`95`	`)`
`95`	`96`	`target_include_directories(`