Skip to content

Commit 29b7176

Browse files
zonglinpeng authored and facebook-github-bot committed
buckify g3 targets, fix issues in quant, dequant, softmax, Replace Bits16 with UInt16 (#7061)
Summary: Pull Request resolved: #7061 update targets in fallback, fixed inherent issues from G3 PR. G3 op status page: https://docs.google.com/document/d/1ZRW6Uoq_NhpVCSH4y-t3Bl2pQZiKXMzSNT5XgrbE0fM/edit?tab=t.0 included D66834249, D66681284 Reviewed By: hsharma35 Differential Revision: D66398494
1 parent f6bfa21 commit 29b7176

File tree

6 files changed

+85
-23
lines changed

6 files changed

+85
-23
lines changed

backends/cadence/aot/functions_fusion_g3.yaml

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
- op: _softmax.out
2121
kernels:
2222
- arg_meta: null
23-
kernel_name: cadence::impl::G3::softmax_out
23+
kernel_name: cadence::impl::G3::_softmax_out
2424

2525
- op: add.out
2626
kernels:
@@ -71,7 +71,7 @@
7171
kernels:
7272
- arg_meta: null
7373
kernel_name: cadence::impl::G3::mul_out
74-
74+
7575
- op: mul.Scalar_out
7676
kernels:
7777
- arg_meta: null
@@ -111,8 +111,21 @@
111111
kernels:
112112
- arg_meta: null
113113
kernel_name: torch::executor::where_out
114-
114+
115115
- op: native_layer_norm.out
116116
kernels:
117117
- arg_meta: null
118-
kernel_name: cadence::impl::G3::native_layer_norm_out
118+
kernel_name: cadence::impl::G3::native_layer_norm_out
119+
120+
# custom ops
121+
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
122+
variants: function
123+
kernels:
124+
- arg_meta: null
125+
kernel_name: cadence::impl::G3::native::quantize_per_tensor_out
126+
127+
- func: cadence::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
128+
variants: function
129+
kernels:
130+
- arg_meta: null
131+
kernel_name: cadence::impl::G3::native::dequantize_per_tensor_out
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
# Buck entry point for this directory. The actual target definitions live in
# targets.bzl so that the TARGETS and BUCK files can share one definition.
load("targets.bzl", "define_common_targets")

# Owning oncall for build breakages in this directory.
oncall("odai_jarvis")

define_common_targets()

backends/cadence/fusion_g3/operators/op_dequantize.cpp

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ void check_dequantize_per_tensor_args(
5252
ET_CHECK_MSG(
5353
input.scalar_type() == ScalarType::Byte ||
5454
input.scalar_type() == ScalarType::Char ||
55-
input.scalar_type() == ScalarType::Bits16 ||
55+
input.scalar_type() == ScalarType::UInt16 ||
5656
input.scalar_type() == ScalarType::Short ||
5757
input.scalar_type() == (ScalarType)Ushort ||
5858
input.scalar_type() == (ScalarType)Bits4 ||
@@ -83,7 +83,7 @@ void check_dequantize_per_tensor_args(
8383
} // namespace
8484

8585
/* Local function which calls the kernels based on the input datatype */
86-
void Dequantize_impl(
86+
void dequantize_impl(
8787
Tensor& out,
8888
const Tensor& input,
8989
float* scale_data,
@@ -211,7 +211,7 @@ void Dequantize_impl(
211211
break;
212212
switch (input.scalar_type()) {
213213
ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_TENSOR);
214-
ASYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16);
214+
ASYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, UInt16);
215215
default:
216216
ET_CHECK_MSG(
217217
false,
@@ -302,7 +302,7 @@ void Dequantize_impl(
302302
break;
303303
switch (input.scalar_type()) {
304304
ET_FORALL_INT_TYPES(ASYM_CALCULATE_INT_TYPE_CHANNEL);
305-
ASYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16);
305+
ASYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, UInt16);
306306
default:
307307
ET_CHECK_MSG(
308308
false,
@@ -368,7 +368,7 @@ void Dequantize_impl(
368368
break;
369369
switch (input.scalar_type()) {
370370
ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_TENSOR);
371-
SYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, Bits16);
371+
SYM_CALCULATE_INT_TYPE_TENSOR(uint16_t, UInt16);
372372
default:
373373
ET_CHECK_MSG(
374374
false,
@@ -459,7 +459,7 @@ void Dequantize_impl(
459459
break;
460460
switch (input.scalar_type()) {
461461
ET_FORALL_INT_TYPES(SYM_CALCULATE_INT_TYPE_CHANNEL);
462-
SYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, Bits16);
462+
SYM_CALCULATE_INT_TYPE_CHANNEL(uint16_t, UInt16);
463463
default:
464464
ET_CHECK_MSG(
465465
false,
@@ -502,7 +502,7 @@ Tensor& dequantize_per_tensor_out(
502502
float scale_data = (float)scale;
503503
int zero_point_data = (int)zero_point;
504504

505-
Dequantize_impl(out, input, &scale_data, &zero_point_data, NULL, out_dtype);
505+
dequantize_impl(out, input, &scale_data, &zero_point_data, NULL, out_dtype);
506506

507507
return out;
508508
}
@@ -620,7 +620,7 @@ Tensor& dequantize_per_channel_out(
620620
for (int i = 0; i < scale.numel(); i++) {
621621
scale_data[i] = (float)scale_dt[i];
622622
}
623-
Dequantize_impl(out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype);
623+
dequantize_impl(out, input, scale_data, zero_point_ptr, axis_ptr, out_dtype);
624624

625625
return out;
626626
}
@@ -661,13 +661,19 @@ Tensor& dequantize_per_tensor_out(
661661
int64_t quant_min,
662662
int64_t quant_max,
663663
ScalarType dtype,
664-
exec_aten::optional<ScalarType> out_dtype,
665664
Tensor& out) {
666665
// TODO(larryliu): Add a context arg to the real op function and remove this
667666
// wrapper
668667
(void)context;
669668
return dequantize_per_tensor_out(
670-
input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out);
669+
input,
670+
scale,
671+
zero_point,
672+
quant_min,
673+
quant_max,
674+
dtype,
675+
out.scalar_type(),
676+
out);
671677
}
672678

673679
Tensor& dequantize_per_tensor_tensor_args_out(
@@ -764,4 +770,4 @@ Tensor& dequantize_per_token_out(
764770
} // namespace native
765771
} // namespace G3
766772
} // namespace impl
767-
} // namespace cadence
773+
} // namespace cadence

backends/cadence/fusion_g3/operators/op_quantize.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ void check_quantize_per_tensor_args(
6969
static_cast<int32_t>(std::numeric_limits<int8_t>::min());
7070
quant_max_upper_bound =
7171
static_cast<int32_t>(std::numeric_limits<int8_t>::max());
72-
} else if (dtype == ScalarType::Bits16) {
72+
} else if (dtype == ScalarType::UInt16) {
7373
quant_min_lower_bound = std::numeric_limits<uint16_t>::min();
7474
quant_max_upper_bound = std::numeric_limits<uint16_t>::max();
7575
} else if (dtype == ScalarType::Short) {
@@ -271,7 +271,7 @@ void quantize_impl(
271271
case ScalarType::in_dtype: \
272272
switch (out.scalar_type()) { \
273273
ET_FORALL_INT_TYPES_WITH(IN_CTYPE, ASYM_QUANTIZE_IMPL_TENSOR); \
274-
ASYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, Bits16) \
274+
ASYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, UInt16) \
275275
default: \
276276
ET_CHECK_MSG( \
277277
false, \
@@ -343,7 +343,7 @@ void quantize_impl(
343343
case ScalarType::in_dtype: \
344344
switch (out.scalar_type()) { \
345345
ET_FORALL_INT_TYPES_WITH(CTYPE_IN, ASYM_QUANTIZE_IMPL_CHANNEL); \
346-
ASYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, Bits16) \
346+
ASYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, UInt16) \
347347
default: \
348348
ET_CHECK_MSG( \
349349
false, \
@@ -458,7 +458,7 @@ void quantize_impl(
458458
case ScalarType::in_dtype: \
459459
switch (out.scalar_type()) { \
460460
ET_FORALL_INT_TYPES_WITH(IN_CTYPE, SYM_QUANTIZE_IMPL_TENSOR); \
461-
SYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, Bits16) \
461+
SYM_QUANTIZE_IMPL_TENSOR(IN_CTYPE, uint16_t, UInt16) \
462462
default: \
463463
ET_CHECK_MSG( \
464464
false, \
@@ -529,7 +529,7 @@ void quantize_impl(
529529
case ScalarType::in_dtype: \
530530
switch (out.scalar_type()) { \
531531
ET_FORALL_INT_TYPES_WITH(CTYPE_IN, SYM_QUANTIZE_IMPL_CHANNEL); \
532-
SYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, Bits16) \
532+
SYM_QUANTIZE_IMPL_CHANNEL(CTYPE_IN, uint16_t, UInt16) \
533533
default: \
534534
ET_CHECK_MSG( \
535535
false, \
@@ -803,4 +803,4 @@ Tensor& quantize_per_token_out(
803803
} // namespace native
804804
} // namespace G3
805805
} // namespace impl
806-
} // namespace cadence
806+
} // namespace cadence

backends/cadence/fusion_g3/operators/op_softmax.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ namespace impl {
2424
namespace G3 {
2525
namespace native {
2626

27-
Tensor& softmax_out(
27+
Tensor& _softmax_out(
2828
KernelRuntimeContext& ctx,
2929
const Tensor& in,
3030
int64_t dim,
@@ -112,4 +112,4 @@ Tensor& softmax_out(
112112
} // namespace native
113113
} // namespace G3
114114
} // namespace impl
115-
} // namespace cadence
115+
} // namespace cadence
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

# Xtensa nnlib (Fusion G3) kernel libraries. These appear both as deps and as
# exported_deps of the op library, so they are factored out here to keep the
# two lists in sync.
_NNLIB_FUSION_G3_DEPS = [
    "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib_common",
    "fbsource//third-party/nnlib-FusionG3/xa_nnlib:libxa_nnlib",
]

def define_common_targets():
    """Defines targets that should be shared between fbcode and xplat.

    The directory containing this targets.bzl file should also contain both
    TARGETS and BUCK files that call this function.
    """

    # Define build targets for all operators registered in the tables above.

    # One library bundling every Fusion G3 operator implementation in this
    # directory: all *.cpp sources are compiled, all *.h headers exported.
    runtime.cxx_library(
        name = "cadence_g3_ops",
        srcs = glob(["*.cpp"]),
        exported_headers = glob(["*.h"]),
        platforms = CXX,
        deps = [
            "//executorch/kernels/portable/cpu/util:all_deps",
            "//executorch/kernels/portable/cpu/pattern:all_deps",
            "//executorch/runtime/kernel:kernel_includes",
            "//executorch/kernels/portable/cpu:scalar_utils",
        ] + _NNLIB_FUSION_G3_DEPS,
        visibility = [
            "//executorch/backends/cadence/...",
            "@EXECUTORCH_CLIENTS",
        ],
        # NOTE(review): the nnlib targets are listed in both deps and
        # exported_deps; in Buck, exported_deps alone usually suffices —
        # confirm before pruning the duplication.
        exported_deps = _NNLIB_FUSION_G3_DEPS,
    )

0 commit comments

Comments
 (0)