
Commit 94d83ad

Cadence fusiong3 operators m2
Differential Revision: D67870337
Pull Request resolved: #7490
1 parent: ce3f4f6

19 files changed: +2184 −308 lines

.gitmodules

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@
     url = https://github.com/pybind/pybind11.git
 [submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
     path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
-    url = https://github.com/foss-xtensa/nnlib-FusionG3/
+    url = https://github.com/foss-xtensa/nnlib-FusionG3.git
 [submodule "third-party/ao"]
     path = third-party/ao
     url = https://github.com/pytorch/ao.git

backends/cadence/aot/functions_fusion_g3.yaml

Lines changed: 20 additions & 6 deletions
@@ -50,12 +50,12 @@
 - op: div.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out
+      kernel_name: cadence::impl::G3::div_out

 - op: div.out_mode
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::div_out_mode
+      kernel_name: cadence::impl::G3::div_out_mode

 - op: embedding.out
   kernels:
@@ -71,7 +71,6 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::G3::mul_out
-
 - op: mul.Scalar_out
   kernels:
     - arg_meta: null
@@ -80,7 +79,7 @@
 - op: permute_copy.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::permute_copy_out
+      kernel_name: cadence::impl::G3::permute_copy_out

 - op: sigmoid.out
   kernels:
@@ -90,7 +89,7 @@
 - op: slice_copy.Tensor_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::slice_copy_Tensor_out
+      kernel_name: cadence::impl::G3::slice_copy_Tensor_out

 - op: split_with_sizes_copy.out
   kernels:
@@ -100,7 +99,12 @@
 - op: sub.out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::sub_out
+      kernel_name: cadence::impl::G3::sub_out
+
+- op: sub.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::sub_scalar_out

 - op: view_copy.out
   kernels:
@@ -117,6 +121,16 @@
     - arg_meta: null
       kernel_name: cadence::impl::G3::native_layer_norm_out

+- op: mean.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::mean_dim_out
+
+- op: exp.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::G3::exp_out
+
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
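
Each entry in this YAML binds an ATen out-variant operator to the kernel that implements it at runtime. The commit moves div, permute_copy, slice_copy, and sub off the generic torch::executor portable kernels onto NNLib-backed cadence::impl::G3 implementations, and wires up three newly added kernels (sub.Scalar, mean, exp). Kernels named here are expected to follow ExecuTorch's out-variant convention (runtime context first, output tensor last); the sketch below shows plausible declarations for two of the new bindings. The exact parameter lists are assumptions, since this diff does not show the kernel headers:

// Plausible declarations for two of the new G3 bindings, following the
// usual ExecuTorch out-variant signature. Parameter lists are assumptions.
#include <executorch/runtime/kernel/kernel_includes.h>

namespace cadence {
namespace impl {
namespace G3 {
namespace native {

using ::executorch::aten::Scalar;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// exp.out: elementwise exponential into a preallocated output tensor.
Tensor& exp_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out);

// sub.Scalar_out: out = a - alpha * b, with b and alpha as scalars.
Tensor& sub_scalar_out(
    KernelRuntimeContext& ctx,
    const Tensor& a,
    const Scalar& b,
    const Scalar& alpha,
    Tensor& out);

} // namespace native
} // namespace G3
} // namespace impl
} // namespace cadence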

backends/cadence/fusion_g3/operators/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -36,6 +36,12 @@ set(_aten_ops__srcs
   "${CMAKE_CURRENT_SOURCE_DIR}/op_native_layer_norm.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/op_quantize.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/op_dequantize.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_sub.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_div.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_mean.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_slice_copy.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_permute_copy.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/op_exp.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_div.cpp"
@@ -51,6 +57,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/dtype_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/normalization_ops_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
 )
 add_library(aten_ops_cadence ${_aten_ops__srcs})
 target_link_libraries(aten_ops_cadence PUBLIC executorch)
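
The new op_*.cpp sources compile into the aten_ops_cadence library alongside the portable kernels the G3 backend still reuses. The extra pattern source, unary_ufunc_realhbbf16_to_floathbf16.cpp, is most plausibly needed because the new exp kernel delegates its elementwise dispatch to that shared portable pattern instead of reimplementing it. A minimal sketch of such a delegation, giving a possible body to the exp_out declared above; the helper's exact signature is an assumption based on its upstream usage, not something this commit shows:

// Hypothetical sketch: an exp kernel reusing the portable unary-ufunc
// pattern whose source file was just added to the build.
#include <cmath>

#include <executorch/kernels/portable/cpu/pattern/pattern.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace cadence {
namespace impl {
namespace G3 {
namespace native {

::executorch::aten::Tensor& exp_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& in,
    ::executorch::aten::Tensor& out) {
  // The pattern helper handles dtype dispatch and element iteration;
  // std::exp is applied per element.
  return torch::executor::native::internal::
      unary_ufunc_realhbbf16_to_floathbf16(std::exp, ctx, in, out);
}

} // namespace native
} // namespace G3
} // namespace impl
} // namespace cadence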

backends/cadence/fusion_g3/operators/op_add.cpp

Lines changed: 4 additions & 2 deletions
@@ -39,6 +39,7 @@ Tensor& add_out(
   ScalarType common_type =
       executorch::runtime::promoteTypes(a.scalar_type(), b.scalar_type());

+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -62,12 +63,12 @@ Tensor& add_out(
       torch::executor::resize_to_broadcast_target_size(a, b, out) == Error::Ok,
       InvalidArgument,
       out);
+#endif

   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);

-  // @lint-ignore CLANGTIDY facebook-hte-CArray
   static constexpr const char op_name[] = "add.out";

   int kTensorDimensionLimit = 5;
@@ -253,6 +254,7 @@ Tensor& add_scalar_out(
       torch::executor::native::utils::promote_type_with_scalar(
           a.scalar_type(), b);

+#ifdef OP_ARG_CHECK
   // Check Common Dtype
   ET_KERNEL_CHECK(
       ctx,
@@ -276,7 +278,7 @@ Tensor& add_scalar_out(
       executorch::runtime::resize_tensor(out, a.sizes()) == Error::Ok,
       InvalidArgument,
       out);
-
+#endif
   // Compute Dtype
   ScalarType compute_type =
       torch::executor::native::utils::get_compute_type(common_type);
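
The add kernels now compile their dtype, dim-order, and broadcast-resize checks only when OP_ARG_CHECK is defined, so size- and cycle-sensitive DSP builds can drop validation entirely while debug builds keep it. A standalone illustration of the idiom follows; it is not the ExecuTorch code, and the container type and check macro are stand-ins:

// Standalone illustration of the OP_ARG_CHECK gating idiom.
// Build with -DOP_ARG_CHECK to compile the validation in.
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for ET_KERNEL_CHECK's log-and-early-return behavior.
#define CHECK_OR_RETURN(cond, retval)                    \
  do {                                                   \
    if (!(cond)) {                                       \
      std::fprintf(stderr, "check failed: %s\n", #cond); \
      return (retval);                                   \
    }                                                    \
  } while (0)

std::vector<float>& add_out(
    const std::vector<float>& a,
    const std::vector<float>& b,
    std::vector<float>& out) {
#ifdef OP_ARG_CHECK
  // Validation exists only in builds that define OP_ARG_CHECK.
  CHECK_OR_RETURN(a.size() == b.size(), out);
  CHECK_OR_RETURN(out.size() == a.size(), out);
#endif
  for (std::size_t i = 0; i < a.size(); ++i) {
    out[i] = a[i] + b[i];
  }
  return out;
}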

backends/cadence/fusion_g3/operators/op_cat.cpp

Lines changed: 36 additions & 72 deletions
@@ -6,13 +6,18 @@
  * LICENSE file in the root directory of this source tree.
  */

+#include <executorch/backends/cadence/fusion_g3/operators/operators.h>
+#include <executorch/backends/cadence/fusion_g3/operators/xt_utils.h>
+
 #include <cstring>

 #include <xa_nnlib_kernels_api.h>

+#include <executorch/backends/cadence/fusion_g3/operators/xt_macros.h>
 #include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>

+using ::executorch::aten::ArrayRef;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
@@ -23,7 +28,6 @@ using ::executorch::runtime::KernelRuntimeContext;
  * updated to have support for below data types, these can be removed and
  * operator need to be updated accordingly
  */
-enum datatype { Ushort = 20, Uint = 23 };

 namespace cadence {
 namespace impl {
@@ -32,20 +36,22 @@ namespace native {

 Tensor& cat_out(
     KernelRuntimeContext& ctx,
-    exec_aten::ArrayRef<Tensor> tensors,
+    ArrayRef<Tensor> tensors,
     int64_t dim,
     Tensor& out) {
   if (dim < 0) {
     dim += out.dim();
   }

+  int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
+
+#ifdef OP_ARG_CHECK
   ET_KERNEL_CHECK(
       ctx,
       torch::executor::check_cat_args(tensors, dim, out),
       InvalidArgument,
       out);

-  int kTensorDimensionLimit = executorch::runtime::kTensorDimensionLimit;
   Tensor::SizesType expected_out_size[kTensorDimensionLimit];
   size_t expected_out_dim = 0;
   torch::executor::get_cat_out_target_size(
@@ -57,14 +63,28 @@ Tensor& cat_out(
           out, {expected_out_size, expected_out_dim}) == Error::Ok,
       InvalidArgument,
       out);
+#endif
+  // Special handling when all inputs are 1D-empty tensors for aten
+  // consistency In that case, just return an 1D-empty tensor without checking
+  // dim
+  bool all_1d_empty = true;
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
+      all_1d_empty = false;
+      break;
+    }
+  }
+  if (all_1d_empty) {
+    return out;
+  }

   const signed char* inp_tensors[tensors.size()];
   const int* inp_tensors_shapes[tensors.size()];

   int inp_shapes_size[tensors.size()];

   int temp_sizes[tensors.size()][kTensorDimensionLimit];
-  exec_aten::ArrayRef<Tensor::SizesType> temp_size;
+  ArrayRef<Tensor::SizesType> temp_size;

   for (int i = 0; i < tensors.size(); i++) {
     inp_tensors[i] = tensors[i].const_data_ptr<signed char>();
@@ -79,88 +99,32 @@ Tensor& cat_out(

   signed char* out_data = out.mutable_data_ptr<signed char>();

-  const exec_aten::ArrayRef<Tensor::SizesType> out_size = out.sizes();
+  const ArrayRef<Tensor::SizesType> out_size = out.sizes();
   int out_shapes[kTensorDimensionLimit];
   for (int i = 0; i < out_size.size(); i++) // output shapes
   {
     out_shapes[i] = out_size[i];
   }

-  if (out.scalar_type() == ScalarType::Int) {
-    xa_nn_cat(
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(int));
-  } else if (out.scalar_type() == ScalarType::Short) {
-    xa_nn_cat(
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(short));
-  } else if (out.scalar_type() == ScalarType::Char) {
-    xa_nn_cat(
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(char));
-  } else if (out.scalar_type() == (ScalarType)Uint) {
-    xa_nn_cat(
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(int));
-  } else if (out.scalar_type() == (ScalarType)Ushort) {
-    xa_nn_cat(
+  if ((out.scalar_type() == ScalarType::Int) ||
+      (out.scalar_type() == ScalarType::Short) ||
+      (out.scalar_type() == ScalarType::Char) ||
+      (out.scalar_type() == ScalarType::UInt32) ||
+      (out.scalar_type() == ScalarType::UInt16) ||
+      (out.scalar_type() == ScalarType::Byte)) {
+    XT_KERNEL_CHECK(
+        ctx,
+        out,
+        xa_nn_cat,
         out_data,
         out_shapes,
         inp_tensors,
         inp_tensors_shapes,
         inp_shapes_size[0],
         tensors.size(),
         (int)dim,
-        sizeof(short));
-  } else if (out.scalar_type() == ScalarType::Byte) {
-    xa_nn_cat(
-        out_data,
-        out_shapes,
-        inp_tensors,
-        inp_tensors_shapes,
-        inp_shapes_size[0],
-        tensors.size(),
-        (int)dim,
-        sizeof(char));
-
+        get_element_size(out.scalar_type()));
   } else {
-    // Special handling when all inputs are 1D-empty tensors for aten
-    // consistency In that case, just return an 1D-empty tensor without checking
-    // dim
-    bool all_1d_empty = true;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-      if (tensors[i].numel() != 0 || tensors[i].dim() != 1) {
-        all_1d_empty = false;
-        break;
-      }
-    }
-    if (all_1d_empty) {
-      return out;
-    }
     const size_t outer = executorch::runtime::getLeadingDims(out, dim);
     const size_t dim_stride = executorch::runtime::getTrailingDims(out, dim);
     const size_t ninputs = tensors.size();