Skip to content

Commit eac96b1

Browse files
committed
Merge branch 'main' of github.com:pytorch/executorch into test
2 parents d1fd7fa + 4111b3f commit eac96b1

File tree

75 files changed

+1126
-187
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

75 files changed

+1126
-187
lines changed

.ci/scripts/test_llama.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ cmake_install_executorch_libraries() {
4747
-DCMAKE_BUILD_TYPE=Release \
4848
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
4949
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
50+
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
5051
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
5152
-Bcmake-out .
5253
cmake --build cmake-out -j9 --target install --config Release
@@ -58,6 +59,7 @@ cmake_build_llama_runner() {
5859
retry cmake -DBUCK2="$BUCK" \
5960
-DCMAKE_INSTALL_PREFIX=cmake-out \
6061
-DCMAKE_BUILD_TYPE=Release \
62+
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
6163
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
6264
-Bcmake-out/${dir} \
6365
${dir}

.ci/scripts/test_quantized_aot_lib.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
set -exu
9+
10+
# shellcheck source=/dev/null
11+
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
12+
13+
which "${PYTHON_EXECUTABLE}"
14+
# Just set this variable here, it's cheap even if we use buck2
15+
CMAKE_OUTPUT_DIR=cmake-out
16+
17+
build_cmake_quantized_aot_lib() {
18+
echo "Building quantized aot lib"
19+
SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
20+
CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
21+
(rm -rf ${CMAKE_OUTPUT_DIR} \
22+
&& mkdir ${CMAKE_OUTPUT_DIR} \
23+
&& cd ${CMAKE_OUTPUT_DIR} \
24+
&& retry cmake -DBUCK2=buck2 \
25+
-DCMAKE_BUILD_TYPE=Release \
26+
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
27+
-DEXECUTORCH_BUILD_QUANTIZED=ON \
28+
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
29+
30+
cmake --build ${CMAKE_OUTPUT_DIR} -j4
31+
}
32+
33+
build_cmake_quantized_aot_lib

.github/workflows/apple.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ jobs:
9090
9191
# Build iOS Frameworks
9292
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
93-
build/build_apple_frameworks.sh --coreml --mps --portable --quantized --xnnpack
93+
build/build_apple_frameworks.sh --coreml --mps --optimized --portable --quantized --xnnpack
9494
9595
# Bundle iOS Frameworks
9696
for FRAMEWORK in "${FRAMEWORKS[@]}"; do (

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ jobs:
8888
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
8989
strategy:
9090
matrix:
91-
dtype: [fp16, fp32]
91+
dtype: [fp32]
9292
build-tool: [buck2, cmake]
9393
fail-fast: false
9494
with:

CMakeLists.txt

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL
140140

141141
option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF)
142142

143-
option(EXECUTORCH_BUILD_EXTENSION_AOT_UTIL "Build the AOT Util extension" OFF)
143+
option(EXECUTORCH_BUILD_EXTENSION_AOT_UTIL "Build the AOT util library" OFF)
144144

145145
option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "Build the Data Loader extension"
146146
OFF)
@@ -158,7 +158,9 @@ option(EXECUTORCH_BUILD_PYBIND "Build the Python Bindings" OFF)
158158

159159
option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)
160160

161-
option(REGISTER_QUANTIZED_OPS "Build the quantized kernels" OFF)
161+
option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)
162+
163+
option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
162164

163165
option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")
164166

@@ -313,14 +315,17 @@ endif()
313315
# operators necessary for the models that will run.
314316
#
315317
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
316-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
317318

318-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
319+
if(EXECUTORCH_BUILD_OPTIMIZED)
320+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
321+
endif()
319322

320-
if(REGISTER_QUANTIZED_OPS)
323+
if(EXECUTORCH_BUILD_QUANTIZED)
321324
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
322325
endif()
323326

327+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
328+
324329
#
325330
# gflags: Commandline flag host library.
326331
#
@@ -347,10 +352,16 @@ cmake_dependent_option(
347352
EXECUTORCH_BUILD_HOST_TARGETS OFF)
348353
if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
349354
# Baseline libraries that executor_runner will link against.
350-
set(_executor_runner_libs executorch optimized_native_cpu_ops_lib gflags)
355+
set(_executor_runner_libs executorch gflags)
356+
357+
if(EXECUTORCH_BUILD_OPTIMIZED)
358+
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
359+
else()
360+
list(APPEND _executor_runner_libs portable_ops_lib)
361+
endif()
351362

352363
# Generate lib to register quantized ops
353-
if(REGISTER_QUANTIZED_OPS)
364+
if(EXECUTORCH_BUILD_QUANTIZED)
354365
list(APPEND _executor_runner_libs quantized_ops_lib)
355366
endif()
356367

@@ -362,6 +373,10 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
362373
target_compile_options(executor_runner PUBLIC ${_common_compile_options})
363374
endif()
364375

376+
if(EXECUTORCH_BUILD_EXTENSION_AOT_UTIL)
377+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/aot_util)
378+
endif()
379+
365380
# Add googletest if any test targets should be built
366381
if(EXECUTORCH_BUILD_GTESTS)
367382
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
@@ -390,10 +405,6 @@ if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
390405
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
391406
endif()
392407

393-
if(EXECUTORCH_BUILD_EXTENSION_AOT_UTIL)
394-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/aot_util)
395-
endif()
396-
397408
if(EXECUTORCH_BUILD_XNNPACK)
398409
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
399410
endif()
Submodule serialization_lib updated from 187af0d to bd8c529

backends/qualcomm/CMakeLists.txt

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -118,27 +118,29 @@ include_directories(
118118
#
119119
# declare targets
120120
#
121+
add_library(executorch_backend INTERFACE)
121122
add_library(qcir INTERFACE qcir_schema_output)
122123
add_library(qcir_utils STATIC)
123-
add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
124-
add_library(executorch_backend INTERFACE)
124+
add_library(qnn_backend STATIC)
125+
add_library(qnn_backend_cache STATIC)
126+
add_library(qnn_context STATIC)
127+
add_library(qnn_device STATIC)
125128
add_library(qnn_executorch_backend SHARED)
126129
add_library(qnn_executorch_header INTERFACE)
127130
add_library(qnn_executorch_logging STATIC)
128-
add_library(qnn_manager STATIC)
131+
add_library(qnn_factory STATIC)
129132
add_library(qnn_function_interface INTERFACE)
133+
add_library(qnn_graph STATIC)
134+
add_library(qnn_header INTERFACE)
130135
add_library(qnn_implementation STATIC)
131-
add_library(qnn_sys_function_interface INTERFACE)
132-
add_library(qnn_sys_implementation STATIC)
133136
add_library(qnn_logger STATIC)
137+
add_library(qnn_manager STATIC)
138+
add_library(qnn_mem_manager STATIC)
134139
add_library(qnn_profiler STATIC)
135-
add_library(qnn_device STATIC)
136-
add_library(qnn_context STATIC)
137-
add_library(qnn_backend_cache STATIC)
138-
add_library(qnn_graph STATIC)
139-
add_library(qnn_backend STATIC)
140-
add_library(qnn_factory STATIC)
141-
add_library(qnn_header INTERFACE)
140+
add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
141+
add_library(qnn_sys_function_interface INTERFACE)
142+
add_library(qnn_sys_implementation STATIC)
143+
add_library(shared_buffer STATIC)
142144
add_library(wrappers STATIC)
143145
add_library(utils STATIC)
144146

@@ -220,6 +222,13 @@ target_link_libraries(qnn_graph
220222
qnn_context
221223
qnn_profiler
222224
)
225+
target_link_libraries(qnn_mem_manager
226+
PRIVATE
227+
qnn_executorch_logging
228+
qnn_implementation
229+
qnn_context
230+
)
231+
223232
target_link_libraries(qnn_factory
224233
PUBLIC
225234
qnn_header
@@ -229,13 +238,15 @@ target_link_libraries(qnn_factory
229238
qnn_device
230239
qnn_context
231240
qnn_graph
241+
qnn_mem_manager
232242
)
233243
target_link_libraries(qnn_manager
234244
PRIVATE
235245
qnn_factory
236246
wrappers
237247
qnn_schema
238248
utils
249+
shared_buffer
239250
)
240251
target_link_libraries(qnn_executorch_backend
241252
PRIVATE
@@ -249,7 +260,11 @@ target_link_libraries(utils
249260
PRIVATE
250261
qnn_executorch_logging
251262
)
252-
263+
target_link_libraries(shared_buffer
264+
PRIVATE
265+
qnn_executorch_logging
266+
${CMAKE_DL_LIBS}
267+
)
253268
#
254269
# add linker option
255270
#

backends/qualcomm/aot/wrappers/TensorWrapper.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper(
105105

106106
Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
107107
if (data != nullptr) {
108+
QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
108109
QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
109110
if (copy_data) {
110111
owned_data_ = std::make_unique<char[]>(bytes_);
@@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) {
144145
return Error::Ok;
145146
}
146147

148+
Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) {
149+
QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
150+
QNN_VER_PTR(tensor_)->memHandle = mem_handle;
151+
return Error::Ok;
152+
}
153+
147154
// base function for Create TensorWrapper
148155
std::shared_ptr<TensorWrapper> CreateTensorWrapper(
149156
const std::string& tensor_name,

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,38 @@ class TensorWrapper {
5959
return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC;
6060
};
6161

62-
const void* GetStaticTensorData() const {
63-
return QNN_VER_PTR(tensor_)->clientBuf.data;
62+
std::uint32_t* GetDims() const {
63+
return QNN_VER_PTR(tensor_)->dimensions;
64+
};
65+
66+
Qnn_DataType_t GetDataType() const {
67+
return QNN_VER_PTR(tensor_)->dataType;
68+
};
69+
70+
Qnn_MemHandle_t const GetMemHandle() {
71+
return QNN_VER_PTR(tensor_)->memHandle;
72+
};
73+
74+
Qnn_TensorMemType_t GetMemType() const {
75+
return QNN_VER_PTR(tensor_)->memType;
6476
};
6577

6678
std::string GetName() const {
6779
return qnn_tensor_name_;
6880
};
6981

82+
std::uint32_t GetRank() const {
83+
return QNN_VER_PTR(tensor_)->rank;
84+
};
85+
86+
const void* GetStaticTensorData() const {
87+
return QNN_VER_PTR(tensor_)->clientBuf.data;
88+
};
89+
7090
Error SetName(const std::string& name);
7191

92+
Error SetMemHandle(Qnn_MemHandle_t mem_handle);
93+
7294
private:
7395
// need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION
7496
std::string qnn_tensor_name_;

backends/qualcomm/passes/insert_io_qdq.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict):
3838
arg_schemas = list(target._schema.arguments)[1:]
3939
for arg_schema in arg_schemas:
4040
name = arg_schema.name
41+
# TODO: Due to the new parameter "out_dtype" in the dequantize node,
42+
# it could not be found in the quant_attrs of other nodes,
43+
# and it will cause a key error. For now, the output type
44+
# of our dequantize node is only float. (by default in pytorch)
45+
if name == "out_dtype":
46+
continue
4147
value = quant_attrs[name]
4248
if type(arg_schema.type) == torch.tensor and type(value) in [int, float]:
4349
value = torch.tensor(value)

backends/qualcomm/runtime/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,10 @@ target_sources(utils
4747
PRIVATE
4848
${CMAKE_CURRENT_LIST_DIR}/Utils.cpp
4949
)
50+
51+
# shared_buffer
52+
target_sources(shared_buffer
53+
PRIVATE
54+
${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h
55+
${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp
56+
)

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,10 @@
88
#pragma once
99

1010
#ifdef __cplusplus
11+
#include <cstddef>
1112
#include <cstdint>
1213
#else
14+
#include <stddef.h>
1315
#include <stdint.h>
1416
#endif
1517

@@ -31,6 +33,16 @@ typedef struct {
3133
}
3234
// clang-format on
3335

36+
/// Allocate specific tensors (usually graph inputs and outputs) on shared
37+
/// memory. Users are responsible to allocate "enough" tensor bytes, and set
38+
/// alignment as MemoryAllocator::kDefaultAlignment.
39+
/// See runtime/core/memory_allocator.h. The function returns a valid pointer
40+
/// if allocation is successful.
41+
void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
42+
43+
/// Free the allocated shared memory.
44+
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
45+
3446
#ifdef __cplusplus
3547
}
3648
#endif // __cplusplus

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,27 @@ Error QnnExecuTorchBackend::execute(
188188
std::vector<Qnn_Tensor_t> input_tensor_structs;
189189
std::vector<Qnn_Tensor_t> output_tensor_structs;
190190

191+
input_tensor_structs.reserve(input_tensors.size());
191192
for (int i = 0; i < input_tensors.size(); ++i) {
192-
input_tensors[i]->FillDataBuffer(
193-
args[i]->toTensor().const_data_ptr(), true /* copy_data */);
193+
if (qnn_manager->RegisterMem(
194+
args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) !=
195+
Error::Ok) {
196+
input_tensors[i]->FillDataBuffer(
197+
args[i]->toTensor().const_data_ptr(), true /* copy_data */);
198+
}
194199
input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
195200
}
196201

197202
int output_index = input_tensors.size();
198203
for (const auto& output_tensor : output_tensors) {
199204
// pos=0 limits the search to the prefix
200205
if (output_tensor->GetName().rfind("output_", 0) == 0) {
201-
output_tensor->FillDataBuffer(
202-
args[output_index]->toTensor().mutable_data_ptr(),
203-
false /* copy_data */);
206+
void* mutable_data_ptr =
207+
args[output_index]->toTensor().mutable_data_ptr();
208+
if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) !=
209+
Error::Ok) {
210+
output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */);
211+
}
204212
output_index++;
205213
}
206214
output_tensor_structs.push_back(output_tensor->CloneTensorStruct());

0 commit comments

Comments
 (0)