Commit d7ad774

Update base for Update on "add instructions about getting mmlu score for instruct models"
Differential Revision: [D64256005](https://our.internmc.facebook.com/intern/diff/D64256005) [ghstack-poisoned]
Merge commit d7ad774 (2 parents: 03b9346 + e95aa9d)

File tree

100 files changed: +1822 additions, -621 deletions

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -135,7 +135,7 @@ jobs:
         delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
       fail-fast: false
     with:
-      runner: linux.2xlarge
+      runner: linux.4xlarge
       docker-image: executorch-ubuntu-22.04-clang12-android
       submodules: 'true'
       timeout: 60
```

.gitmodules

Lines changed: 0 additions & 3 deletions
```diff
@@ -28,9 +28,6 @@
 [submodule "backends/xnnpack/third-party/pthreadpool"]
 	path = backends/xnnpack/third-party/pthreadpool
 	url = https://github.com/Maratyszcza/pthreadpool.git
-[submodule "examples/third-party/fbjni"]
-	path = examples/third-party/fbjni
-	url = https://github.com/facebookincubator/fbjni.git
 [submodule "extension/llm/third-party/abseil-cpp"]
 	path = extension/llm/third-party/abseil-cpp
 	url = https://github.com/abseil/abseil-cpp.git
```

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
```diff
@@ -201,6 +201,10 @@ option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF)
 
 option(EXECUTORCH_BUILD_DEVTOOLS "Build the ExecuTorch Developer Tools")
 
+option(EXECUTORCH_NNLIB_OPT "Build Cadence backend Hifi nnlib kernel" OFF)
+
+option(EXECUTORCH_CADENCE_CPU_RUNNER "Build Cadence backend CPU runner" OFF)
+
 option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
 
 option(EXECUTORCH_BUILD_XNNPACK "Build the XNNPACK backend" OFF)
```

backends/cadence/CMakeLists.txt

Lines changed: 48 additions & 0 deletions
```diff
@@ -25,6 +25,54 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(TARGET_DIR reference)
 
+if(EXECUTORCH_CADENCE_CPU_RUNNER)
+  include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+  if(NOT PYTHON_EXECUTABLE)
+    resolve_python_executable()
+  endif()
+
+  set(_common_compile_options -Wno-deprecated-declarations -fPIC)
+
+  # Find prebuilt libraries. executorch package should contain portable_ops_lib,
+  # etdump, bundled_program.
+  find_package(executorch CONFIG REQUIRED)
+  target_link_options_shared_lib(executorch)
+  target_link_options_shared_lib(portable_ops_lib)
+
+  target_include_directories(executorch INTERFACE ${_common_include_directories})
+
+  find_package(
+    gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party
+  )
+
+  add_executable(cadence_runner
+    ${EXECUTORCH_ROOT}/examples/devtools/example_runner/example_runner.cpp
+  )
+  target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
+
+  target_include_directories(
+    etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../devtools/include
+                     ${EXECUTORCH_ROOT}/third-party/flatcc/include
+  )
+
+  target_include_directories(
+    cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
+                          ${_common_include_directories}
+  )
+
+  target_link_libraries(
+    cadence_runner
+    executorch
+    gflags
+    etdump
+    extension_data_loader
+    bundled_program
+    cadence_ops_lib
+    flatccrt
+  )
+endif()
+
 if(EXECUTORCH_NNLIB_OPT)
   set(TARGET_DIR hifi)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib)
```

backends/cadence/cadence_runner/build_cadence_runner.sh renamed to backends/cadence/build_cadence_runner.sh

Lines changed: 3 additions & 10 deletions
```diff
@@ -12,7 +12,7 @@ set -euo pipefail
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 readonly SCRIPT_DIR
 
-readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../../.."
+readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../.."
 
 # Allow overriding the number of build jobs. Default to 9.
 export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}"
@@ -25,15 +25,7 @@ main() {
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_DEVTOOLS=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-    -DPYTHON_EXECUTABLE=python3 \
-    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-    -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
-    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
-    -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
-    -DEXECUTORCH_BUILD_CPUINFO=OFF \
-    -DEXECUTORCH_ENABLE_LOGGING=ON \
-    -DEXECUTORCH_NNLIB_OPT=OFF \
-    -Bcmake-out
+    -Bcmake-out .
   cmake --build cmake-out --target install --config Release -j16
 
   local example_dir=backends/cadence
@@ -42,6 +34,7 @@ main() {
   rm -rf ${build_dir}
   cmake -DCMAKE_PREFIX_PATH="${cmake_prefix_path}" \
     -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_CADENCE_CPU_RUNNER=ON \
     -B"${build_dir}" \
     "${example_dir}"
   cmake --build "${build_dir}" --config Release -j16
```

backends/cadence/cadence_runner/CMakeLists.txt

Lines changed: 0 additions & 74 deletions
This file was deleted.

backends/cadence/reference/kernels/kernels.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/cadence/reference/kernels/kernels.h>
 #include <math.h>
 #include <algorithm>
 #include <cstring>
```

backends/cadence/reference/operators/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -33,7 +33,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
-  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
```

backends/cadence/reference/operators/quantized_conv_out.cpp

Lines changed: 59 additions & 28 deletions
```diff
@@ -190,34 +190,65 @@ void quantized_conv_out(
   // per-channel
   bool per_tensor_quantized = bias_scale.numel() == 1;
 
-  conv2d_nchw_core_generic<uint8_t, uint8_t, int32_t, uint8_t, true>(
-      input.const_data_ptr<uint8_t>(),
-      weight.const_data_ptr<uint8_t>(),
-      bias.const_data_ptr<int32_t>(),
-      out.mutable_data_ptr<uint8_t>(),
-      n,
-      c,
-      h,
-      w,
-      oc,
-      wc,
-      wh,
-      ww,
-      oh,
-      ow,
-      stride[0],
-      stride[1],
-      padding[0],
-      padding[1],
-      dilation[0],
-      dilation[1],
-      groups,
-      in_zero_point,
-      weight_zero_point.const_data_ptr<int32_t>(),
-      bias_scale.const_data_ptr<float>(),
-      output_scale,
-      (uint8_t)output_zero_point,
-      per_tensor_quantized);
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    conv2d_nchw_core_generic<uint8_t, uint8_t, int32_t, uint8_t, true>(
+        input.const_data_ptr<uint8_t>(),
+        weight.const_data_ptr<uint8_t>(),
+        bias.const_data_ptr<int32_t>(),
+        out.mutable_data_ptr<uint8_t>(),
+        n,
+        c,
+        h,
+        w,
+        oc,
+        wc,
+        wh,
+        ww,
+        oh,
+        ow,
+        stride[0],
+        stride[1],
+        padding[0],
+        padding[1],
+        dilation[0],
+        dilation[1],
+        groups,
+        in_zero_point,
+        weight_zero_point.const_data_ptr<int32_t>(),
+        bias_scale.const_data_ptr<float>(),
+        output_scale,
+        (uint8_t)output_zero_point,
+        per_tensor_quantized);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    conv2d_nchw_core_generic<int8_t, int8_t, int32_t, int8_t, true>(
+        input.const_data_ptr<int8_t>(),
+        weight.const_data_ptr<int8_t>(),
+        bias.const_data_ptr<int32_t>(),
+        out.mutable_data_ptr<int8_t>(),
+        n,
+        c,
+        h,
+        w,
+        oc,
+        wc,
+        wh,
+        ww,
+        oh,
+        ow,
+        stride[0],
+        stride[1],
+        padding[0],
+        padding[1],
+        dilation[0],
+        dilation[1],
+        groups,
+        in_zero_point,
+        weight_zero_point.const_data_ptr<int32_t>(),
+        bias_scale.const_data_ptr<float>(),
+        output_scale,
+        (int8_t)output_zero_point,
+        per_tensor_quantized);
+  }
 }
 
 }; // namespace native
```

backends/cadence/runtime/executor.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -106,9 +106,7 @@ def __init__(
         working_dir: str = "",
     ):
         self.working_dir = working_dir
-        self.executor_builder = (
-            "./backends/cadence/cadence_runner/build_cadence_runner.sh"
-        )
+        self.executor_builder = "./backends/cadence/build_cadence_runner.sh"
         self.execute_runner = "./cmake-out/backends/cadence/cadence_runner"
         self.bundled_program_path: str = "CadenceDemoModel.bpte"
```
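
For orientation, here is a minimal Python sketch of how a harness might consume the two paths above. Only the two path strings come from this diff; the subprocess calls and the `--bundled_program_path` flag name are assumptions for illustration, not the actual `executor.py` logic.

```python
# Hypothetical sketch; only the two path strings are taken from this diff.
import subprocess

EXECUTOR_BUILDER = "./backends/cadence/build_cadence_runner.sh"
EXECUTE_RUNNER = "./cmake-out/backends/cadence/cadence_runner"


def build_and_run(bundled_program_path: str = "CadenceDemoModel.bpte") -> None:
    # Build cadence_runner once via the relocated shell script.
    subprocess.run([EXECUTOR_BUILDER], check=True)
    # Run the bundled program; the flag name is an assumption.
    subprocess.run(
        [EXECUTE_RUNNER, f"--bundled_program_path={bundled_program_path}"],
        check=True,
    )
```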

backends/qualcomm/_passes/decompose_einsum.py (new file; the path is inferred from the class name and the sibling Qualcomm passes below)

Lines changed: 65 additions & 0 deletions

```diff
@@ -0,0 +1,65 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx.experimental.proxy_tensor import make_fx
+
+
+class DecomposeEinsum(ExportPass):
+    """
+    Decompose einsum so that quantization annotation works properly.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.einsum.default:
+                decomposed_module = make_fx(
+                    node.target,
+                    tracing_mode="fake",
+                )(node.args[0], [arg.meta["val"] for arg in node.args[1]])
+
+                with graph.inserting_before(node):
+                    # remap maps original node values to new node values, which
+                    # ensures that references to nodes are correctly updated in
+                    # the new graph.
+                    remap = {}
+                    # Unlike other ops, einsum's args[0] is the equation string,
+                    # while the input nodes are stored in args[1].
+                    for i, arg in enumerate(node.args[1]):
+                        remap[f"arg1_{i+1}"] = arg
+
+                    for decomposed_node in decomposed_module.graph.nodes:
+                        # This is the args[0] equation string, which is no
+                        # longer needed after decomposition.
+                        if "arg0" in decomposed_node.name:
+                            continue
+
+                        # No need to copy the existing 'output' node; rewire
+                        # users of the einsum node to the decomposed result.
+                        if decomposed_node.op == "output":
+                            for user in node.users.copy():
+                                user.replace_input_with(
+                                    node,
+                                    remap[decomposed_node.args[0][0]],
+                                )
+                        # No need to copy existing placeholders; re-key the
+                        # remap entry from placeholder name (string) to node.
+                        elif decomposed_node.op == "placeholder":
+                            remap[decomposed_node] = remap.pop(decomposed_node.name)
+                        else:
+                            remap[decomposed_node] = graph.node_copy(
+                                decomposed_node,
+                                arg_transform=lambda x, remap=remap: remap[x],
+                            )
+
+                graph.erase_node(node)
+
+        graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
```
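
For readers wanting to try the pass above, a minimal usage sketch follows. The toy module, shapes, and the capture API are assumptions; any pre-autograd export that keeps `aten.einsum.default` in the graph (with `meta["val"]` populated on its inputs) should exercise the same code path.

```python
import torch


class EinsumBmm(torch.nn.Module):
    def forward(self, x, y):
        # Batched matmul expressed as an einsum.
        return torch.einsum("bij,bjk->bik", x, y)


x, y = torch.randn(2, 3, 4), torch.randn(2, 4, 5)
# Capture a graph that still contains aten.einsum.default (assumed API).
gm = torch.export.export_for_training(EinsumBmm(), (x, y)).module()
result = DecomposeEinsum()(gm)
# After the pass, no einsum node should remain in the graph.
assert all(
    n.target != torch.ops.aten.einsum.default
    for n in result.graph_module.graph.nodes
)
```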

backends/qualcomm/_passes/insert_requantize.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -28,6 +28,7 @@ class InsertRequantize(ExportPass):
     # we don't use the 2nd output, 2nd output is an integer, etc.
     multi_output_op_ignore_set = {
         exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+        exir_ops.edge.aten.topk.default,
     }
 
     def __init__(
```
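
For context on why `topk` joins this ignore set: its second output is integer indices, which carry no quantization parameters, so only the first (floating-point) output is a requantization candidate. A quick stand-alone illustration:

```python
import torch

values, indices = torch.topk(torch.tensor([0.1, 0.9, 0.4]), k=2)
print(values)         # tensor([0.9000, 0.4000]) -- float, may need requantization
print(indices)        # tensor([1, 2]) -- integer indices, nothing to requantize
print(indices.dtype)  # torch.int64
```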

backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -65,6 +65,7 @@ class LayoutTransform(ExportPass):
         exir_ops.edge.aten.sqrt.default,
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.sum.dim_IntList,
+        exir_ops.edge.aten.topk.default,
         exir_ops.edge.aten._to_copy.default,
         exir_ops.edge.aten.split_with_sizes.default,
         *q_ops,
```
