
Commit cee5e89

Update on "[ET-VK] Store unique ptr to Tensor in Value instead of inlined tensor object, to reduce Value struct size from 448 to 80 bytes."
This diff reduces the size of the Value struct in the ExecuTorch Vulkan runtime by storing a unique pointer to the Tensor object instead of an inlined tensor object. The change shrinks the Value struct from 448 bytes to 80 bytes, which can improve performance and reduce memory usage.

Differential Revision: [D66655991](https://our.internmc.facebook.com/intern/diff/D66655991/)

[ghstack-poisoned]
2 parents dc09af3 + 5860506 commit cee5e89
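
For readers unfamiliar with the pattern, here is a minimal, self-contained C++ sketch of the idea behind the change, not the actual ExecuTorch Vulkan code: replacing an inlined member with a std::unique_ptr makes the enclosing value type roughly pointer-sized regardless of how large the payload type is, at the cost of one heap allocation when the payload is actually stored. The LargePayload, InlineValue, and PointerValue names are hypothetical.

#include <cstdio>
#include <memory>

// Hypothetical stand-in for a large object that used to live inline in the
// Value struct (e.g. a tensor with inlined metadata buffers).
struct LargePayload {
  char storage[440];
};

// Variant-style value holding the payload inline: sizeof grows with the payload.
struct InlineValue {
  int tag;
  LargePayload payload;
};

// Variant-style value holding only a unique_ptr: sizeof stays small no matter
// how large LargePayload becomes; the payload is heap-allocated on demand.
struct PointerValue {
  int tag;
  std::unique_ptr<LargePayload> payload;
};

int main() {
  std::printf("inline value : %zu bytes\n", sizeof(InlineValue));   // grows with LargePayload
  std::printf("pointer value: %zu bytes\n", sizeof(PointerValue));  // stays pointer-sized

  PointerValue v;
  v.tag = 1;
  v.payload = std::make_unique<LargePayload>();  // allocation happens only here
  return 0;
}

The real Value evidently keeps other members as well (it ends up at 80 bytes rather than pointer size), but the mechanism is the same: only a pointer to the Tensor object is stored inline.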


69 files changed: 705 additions & 407 deletions

CMakeLists.txt

Lines changed: 9 additions & 3 deletions
@@ -200,8 +200,6 @@ option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF)
 
-option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF)
-
 option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF)
 
 option(EXECUTORCH_BUILD_NEURON "Build the backends/mediatek directory" OFF)
@@ -216,6 +214,8 @@ option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF)
 
 option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools")
 
+option(EXECUTORCH_BUILD_TESTS "Build CMake-based unit tests" OFF)
+
 option(EXECUTORCH_NNLIB_OPT "Build Cadence backend Hifi nnlib kernel" OFF)
 
 option(EXECUTORCH_CADENCE_CPU_RUNNER "Build Cadence backend CPU runner" OFF)
@@ -330,6 +330,10 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   )
 endif()
 
+if(EXECUTORCH_BUILD_TESTS)
+  include(CTest)
+endif()
+
 if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
 endif()
@@ -625,7 +629,7 @@ cmake_dependent_option(
 )
 
 # Add googletest if any test targets should be built
-if(EXECUTORCH_BUILD_GTESTS)
+if(BUILD_TESTING)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
 endif()
 
@@ -829,5 +833,7 @@ if(EXECUTORCH_BUILD_VULKAN)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
 endif()
 
+include(Test.cmake)
+
 # Print all summary
 executorch_print_configuration_summary()

Test.cmake

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# A helper CMake file to trigger C++ unit tests.
+#
+
+if(BUILD_TESTING)
+  # This contains the list of tests which are always built
+  add_subdirectory(extension/evalue_util/test)
+  add_subdirectory(extension/kernel_util/test)
+  add_subdirectory(extension/memory_allocator/test)
+  add_subdirectory(extension/parallel/test)
+  add_subdirectory(extension/pytree/test)
+  add_subdirectory(kernels/portable/cpu/util/test)
+  add_subdirectory(kernels/prim_ops/test)
+  add_subdirectory(kernels/test)
+  add_subdirectory(runtime/core/exec_aten/testing_util/test)
+  add_subdirectory(runtime/core/exec_aten/util/test)
+  add_subdirectory(runtime/core/portable_type/test)
+  add_subdirectory(runtime/core/test)
+  add_subdirectory(runtime/executor/test)
+  add_subdirectory(runtime/kernel/test)
+  add_subdirectory(runtime/platform/test)
+  add_subdirectory(test/utils)
+endif()

backends/arm/test/ops/test_depthwise_conv.py

Lines changed: 1 addition & 0 deletions
@@ -260,6 +260,7 @@ def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
     ) # Works
 
     @parameterized.expand(testsuite_conv2d, skip_on_empty=True)
+    @unittest.expectedFailure
     def test_dw_conv2d_u55_BI(
         self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False
     ):

backends/cadence/aot/ops_registrations.py

Lines changed: 31 additions & 2 deletions
@@ -146,7 +146,10 @@
     "quantized_fully_connected(Tensor src, Tensor weight, Tensor bias, int src_zero_point, "
     "Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
 )
-
+lib.define(
+    "quantized_fully_connected.per_tensor(Tensor src, Tensor weight, Tensor bias, int src_zero_point, "
+    "int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
+)
 
 # ------------------------------------ #
 # Migrated from custom_ops.ymal        #
@@ -192,6 +195,10 @@
     "quantized_fully_connected.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, "
     "Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_fully_connected.per_tensor_out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, "
+    "int weight_zero_point, int out_multiplier, int out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)"
+)
 lib.define(
     "quantized_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor weight_zero_points, "
     "Tensor indices, bool pruned_weights=False, *, Tensor(a!) out) -> Tensor(a!)"
@@ -595,6 +602,28 @@ def quantized_fully_connected_meta(
     bias: torch.Tensor,
     in_zero_point: int,
     weight_zero_point: torch.Tensor,
+    out_multiplier: torch.Tensor,
+    out_shift: torch.Tensor,
+    out_zero_point: int,
+    offset: Optional[torch.Tensor],
+) -> torch.Tensor:
+    # src comes in shape [leading_dims, in_dim]
+    # weight comes in shape [out_dim, in_dim]
+    # output comes in empty with shape [leading_dims, out_dim]
+    out_size = list(src.size())
+    weight_size = list(weight.size())
+    assert len(weight_size) == 2
+    out_size[-1] = weight_size[0]
+    return src.new_empty(out_size, dtype=src.dtype)
+
+
+@register_fake("cadence::quantized_fully_connected.per_tensor")
+def quantized_fully_connected_per_tensor_meta(
+    src: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    in_zero_point: int,
+    weight_zero_point: int,
     out_multiplier: int,
     out_shift: int,
     out_zero_point: int,
@@ -607,7 +636,7 @@ def quantized_fully_connected_meta(
     weight_size = list(weight.size())
     assert len(weight_size) == 2
     out_size[-1] = weight_size[0]
-    return src.new_empty(out_size, dtype=torch.uint8)
+    return src.new_empty(out_size, dtype=src.dtype)
 
 
 @register_fake("cadence::convolution")

backends/cadence/aot/replace_ops.py

Lines changed: 6 additions & 6 deletions
@@ -9,6 +9,8 @@
 # 3. functions that replace an ATen op with another semantically equivalent ATen op.
 # 4. functions that concretize optional args.
 
+# pyre-unsafe
+
 import math
 from operator import neg
 from typing import cast, Dict, Iterable, Sequence, Set, Tuple
@@ -1698,12 +1700,6 @@ def call_operator(self, op, args, kwargs, meta):
         if leading_dims != 1:
             return super().call_operator(op, args, kwargs, meta)
 
-        # If the op is quantized::linear, but per-channel quantized, bail.
-        if op == exir_ops.edge.cadence.quantized_linear.default:
-            weight = args[1].to_tensor() if isinstance(args[1], ProxyValue) else args[1]
-            if weight.shape != [1]:
-                return super().call_operator(op, args, kwargs, meta)
-
         # Replace the linear with fully connected op
         return super().call_operator(
             self.linear_to_fc_op[op],
@@ -1893,6 +1889,10 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
             exir_ops.edge.cadence.quantized_conv.per_tensor,
             [8, 9, 12, 13],
         ),
+        exir_ops.edge.cadence.quantized_fully_connected: (
+            exir_ops.edge.cadence.quantized_fully_connected.per_tensor,
+            [4, 5, 6],
+        ),
         exir_ops.edge.cadence.quantized_layer_norm: (
             exir_ops.edge.cadence.quantized_layer_norm.per_tensor,
             [1, 2],
New file (Cadence Fusion G3 build script; path not captured in this view)

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -euo pipefail
+
+unset CMAKE_PREFIX_PATH
+unset XTENSA_CORE
+export XTENSA_CORE=FCV_FG3GP
+git submodule sync
+git submodule update --init
+./install_requirements.sh
+
+rm -rf cmake-out
+
+STEPWISE_BUILD=false
+
+if $STEPWISE_BUILD; then
+  echo "Building ExecuTorch"
+  cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
+      -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CPUINFO=OFF \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=OFF \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -DHAVE_FNMATCH_H=OFF \
+      -Bcmake-out .
+
+  echo "Building any Cadence-specific binaries on top"
+  cmake -DBUCK2="$BUCK" \
+      -DCMAKE_TOOLCHAIN_FILE=/home/zonglinpeng/ws/zonglinpeng/executorch/backends/cadence/cadence.cmake \
+      -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=ON \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \
+      -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \
+      -DPYTHON_EXECUTABLE=python3 \
+      -DEXECUTORCH_FUSION_G3_OPT=ON \
+      -DEXECUTORCH_BUILD_GFLAGS=ON \
+      -DHAVE_FNMATCH_H=OFF \
+      -Bcmake-out/backends/cadence \
+      backends/cadence
+  cmake --build cmake-out/backends/cadence -j8
+else
+  echo "Building Cadence toolchain with ExecuTorch packages"
+  cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
+  cmake -DBUCK2="$BUCK" \
+      -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
+      -DHAVE_SYS_STAT_H=ON \
+      -DCMAKE_TOOLCHAIN_FILE=./backends/cadence/cadence.cmake \
+      -DCMAKE_INSTALL_PREFIX=cmake-out \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
+      -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+      -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
+      -DEXECUTORCH_BUILD_CPUINFO=OFF \
+      -DEXECUTORCH_BUILD_FLATC=OFF \
+      -DEXECUTORCH_BUILD_CADENCE=ON \
+      -DFLATC_EXECUTABLE="$(which flatc)" \
+      -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
+      -DEXECUTORCH_ENABLE_LOGGING=ON \
+      -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
+      -DEXECUTORCH_USE_DL=OFF \
+      -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \
+      -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \
+      -DPYTHON_EXECUTABLE=python3 \
+      -DEXECUTORCH_FUSION_G3_OPT=ON \
+      -DHAVE_FNMATCH_H=OFF \
+      -Bcmake-out
+  cmake --build cmake-out --target install --config Release -j8
+fi
+
+echo "Run simple model to verify cmake build"
+python3 -m examples.portable.scripts.export --model_name="add"
+xt-run --turbo cmake-out/executor_runner --model_path=add.pte

backends/cadence/build_cadence_xtensa.sh renamed to backends/cadence/build_cadence_hifi4.sh

Lines changed: 4 additions & 2 deletions
@@ -8,6 +8,8 @@
 set -euo pipefail
 
 unset CMAKE_PREFIX_PATH
+unset XTENSA_CORE
+export XTENSA_CORE=nxp_rt600_RI23_11_newlib
 git submodule sync
 git submodule update --init
 ./install_requirements.sh
@@ -53,7 +55,7 @@ if $STEPWISE_BUILD; then
       -DHAVE_FNMATCH_H=OFF \
       -Bcmake-out/backends/cadence \
       backends/cadence
-  cmake --build cmake-out/backends/cadence -j16
+  cmake --build cmake-out/backends/cadence -j8
 else
   echo "Building Cadence toolchain with ExecuTorch packages"
   cmake_prefix_path="${PWD}/cmake-out/lib/cmake/ExecuTorch;${PWD}/cmake-out/third-party/gflags"
@@ -79,7 +81,7 @@ else
       -DEXECUTORCH_NNLIB_OPT=ON \
      -DHAVE_FNMATCH_H=OFF \
      -Bcmake-out
-  cmake --build cmake-out --target install --config Release -j16
+  cmake --build cmake-out --target install --config Release -j8
 fi
 
 echo "Run simple model to verify cmake build"

backends/cadence/hifi/operators/op_mean.cpp

Lines changed: 1 addition & 2 deletions
@@ -145,8 +145,7 @@ Tensor& mean_dim_out(
   ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
     ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
       CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-      const size_t num =
-          torch::executor::exeget_reduced_dim_product(in, dim_list);
+      const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);
       for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
         CTYPE_OUT sum = 0;
         if (in.numel() > 0) {

backends/qualcomm/_passes/remove_redundancy.py

Lines changed: 12 additions & 1 deletion
@@ -11,7 +11,7 @@
 
 class RemoveRedundancy(ExportPass):
     """
-    Trim the 'identity' operators to reduce the unnecessary copy overhead.
+    Trim certain operators to reduce unnecessary overhead.
     """
 
     redundant_ops = {
@@ -21,6 +21,10 @@ class RemoveRedundancy(ExportPass):
         torch.ops.aten.alias.default,
         exir_ops.edge.aten.alias.default,
        exir_ops.edge.aten.lift_fresh_copy.default,
+        # remove this target if '_skip_dim_order' is set to False
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
+        # remove channel_last / contiguous _to_copy if '_skip_dim_order' is set to True
+        exir_ops.edge.aten._to_copy.default,
     }
 
     def __init__(self):
@@ -31,6 +35,13 @@ def _remove(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule:
             if n.target not in self.redundant_ops:
                 continue
 
+            # do not remove cast operator
+            if (
+                n.target == exir_ops.edge.aten._to_copy.default
+                and "memory_format" not in n.kwargs
+            ):
+                continue
+
             to_be_remove = n
             for user_n in list(n.users.keys()):
                 user_n.replace_input_with(n, n.args[0])

backends/qualcomm/builders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -14,6 +14,7 @@
     op_ceil,
     op_clamp,
     op_conv2d,
+    op_cos,
     op_depth_to_space,
     op_dequantize,
     op_div,
@@ -43,6 +44,7 @@
     op_rsqrt,
     op_select_copy,
     op_sigmoid,
+    op_sin,
     op_skip_ops,
     op_slice_copy,
     op_softmax,
@@ -71,6 +73,7 @@
     op_ceil,
     op_clamp,
     op_conv2d,
+    op_cos,
     op_depth_to_space,
     op_dequantize,
     op_div,
@@ -100,6 +103,7 @@
     op_rsqrt,
     op_select_copy,
     op_sigmoid,
+    op_sin,
     op_skip_ops,
     op_slice_copy,
     op_softmax,

0 commit comments
