Skip to content

Commit bac5729

Browse files
authored
Merge branch 'main' into release_compatibility
2 parents 84e02c5 + 4b0ed91 commit bac5729

File tree

28 files changed

+846
-165
lines changed

28 files changed

+846
-165
lines changed

.github/workflows/_unittest.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ jobs:
3737
# Setup MacOS dependencies as there is no Docker support on MacOS atm
3838
PYTHON_EXECUTABLE=python \
3939
EXECUTORCH_BUILD_PYBIND=ON \
40+
EXECUTORCH_BUILD_XNNPACK=ON \
4041
.ci/scripts/setup-linux.sh "${BUILD_TOOL}"
4142
4243
# Run pytest with coverage

.github/workflows/pull.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ jobs:
232232
# build module for executorch.extension.pybindings.portable_lib
233233
BUILD_TOOL=${{ matrix.build-tool }}
234234
PYTHON_EXECUTABLE=python \
235+
EXECUTORCH_BUILD_XNNPACK=ON \
235236
EXECUTORCH_BUILD_PYBIND=ON \
236237
bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
237238

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,9 @@ if(EXECUTORCH_BUILD_PYBIND)
508508
endif()
509509

510510
if(EXECUTORCH_BUILD_XNNPACK)
511-
set(PYBIND_LINK_XNNPACK "xnnpack_backend")
511+
# need to explicitly specify XNNPACK here
512+
# otherwise uses XNNPACK symbols from libtorch_cpu
513+
set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK)
512514
endif()
513515

514516
# find pytorch lib, to allow pybind to take at::Tensor as input/output

backends/vulkan/runtime/api/Tensor.cpp

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -197,25 +197,6 @@ api::utils::uvec3 create_image_extents(
197197
}
198198
}
199199

200-
api::UniformParamsBuffer make_metadata_uniform(
201-
api::Context* const context,
202-
const std::vector<int64_t>& sizes,
203-
const std::vector<int64_t>& strides,
204-
const api::StorageType storage_type) {
205-
if (storage_type != api::StorageType::BUFFER) {
206-
return api::UniformParamsBuffer();
207-
}
208-
209-
vTensor::BufferMetadata metadata{
210-
api::utils::make_whcn_uvec4(sizes),
211-
api::utils::make_whcn_uvec4(strides),
212-
api::utils::safe_downcast<uint32_t>(sizes.size()),
213-
api::utils::safe_downcast<uint32_t>(api::utils::multiply_integers(sizes)),
214-
};
215-
216-
return api::UniformParamsBuffer(context, metadata);
217-
}
218-
219200
} // namespace
220201

221202
//
@@ -239,7 +220,6 @@ vTensor::vTensor(
239220
virtual_extents_(
240221
create_image_extents(gpu_sizes_, storage_type, memory_layout)),
241222
// Utility Uniform Buffers that can be passed to shaders as arguments
242-
metadata_uniform_(),
243223
cpu_sizes_uniform_(nullptr),
244224
gpu_sizes_uniform_(nullptr),
245225
extents_uniform_(nullptr),
@@ -270,7 +250,6 @@ vTensor::vTensor(
270250
virtual_extents_(
271251
create_image_extents(gpu_sizes_, storage_type, memory_layout)),
272252
// Vulkan uniform buffer containing sizes and stride info
273-
metadata_uniform_(),
274253
cpu_sizes_uniform_(nullptr),
275254
gpu_sizes_uniform_(nullptr),
276255
extents_uniform_(nullptr),
@@ -316,14 +295,6 @@ api::VulkanBuffer& vTensor::buffer(
316295
return view_->buffer_;
317296
}
318297

319-
api::VulkanBuffer& vTensor::buffer_metadata() {
320-
if (!metadata_uniform_.buffer()) {
321-
metadata_uniform_ = make_metadata_uniform(
322-
view_->context_, gpu_sizes_, gpu_strides_, storage_type());
323-
}
324-
return metadata_uniform_.buffer();
325-
}
326-
327298
std::shared_ptr<api::UniformParamsBuffer> vTensor::cpu_sizes_ubo() {
328299
if (!cpu_sizes_uniform_) {
329300
cpu_sizes_uniform_.reset(new api::UniformParamsBuffer(
@@ -353,16 +324,6 @@ std::shared_ptr<api::UniformParamsBuffer> vTensor::extents_ubo() {
353324
return extents_uniform_;
354325
}
355326

356-
vTensor::BufferMetadata vTensor::get_cpu_buffer_metadata() const {
357-
return {
358-
api::utils::make_whcn_uvec4(sizes_),
359-
api::utils::make_whcn_uvec4(strides_),
360-
api::utils::safe_downcast<uint32_t>(sizes_.size()),
361-
api::utils::safe_downcast<uint32_t>(
362-
api::utils::multiply_integers(sizes_)),
363-
};
364-
}
365-
366327
VmaAllocationCreateInfo vTensor::get_allocation_create_info() const {
367328
switch (storage_type()) {
368329
case api::StorageType::BUFFER:

backends/vulkan/runtime/api/Tensor.h

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,6 @@ class vTensor final {
129129
vTensor(vTensor&& other) = default;
130130
vTensor& operator=(vTensor&& other) = default;
131131

132-
// Used for passing buffer sizes and strides data to shaders
133-
struct BufferMetadata {
134-
api::utils::uvec4 sizes;
135-
api::utils::uvec4 strides;
136-
uint32_t ndim;
137-
uint32_t buffer_length;
138-
};
139-
140132
private:
141133
// Tensor Options
142134
api::ScalarType dtype_;
@@ -159,10 +151,6 @@ class vTensor final {
159151
// to be interpreted as a tensor with a different size.
160152
api::utils::uvec3 virtual_extents_;
161153

162-
// A Vulkan uniform buffer containing sizes and strides of the GPU buffer that
163-
// can be passed into a shader.
164-
api::UniformParamsBuffer metadata_uniform_;
165-
166154
// A Vulkan uniform buffer containing the tensor sizes that can be passed into
167155
// a shader.
168156
std::shared_ptr<api::UniformParamsBuffer> cpu_sizes_uniform_;
@@ -285,12 +273,6 @@ class vTensor final {
285273
return virtual_extents_;
286274
}
287275

288-
/*
289-
* Get a uniform buffer containing sizes and strides information of the GPU
290-
* buffer
291-
*/
292-
api::VulkanBuffer& buffer_metadata();
293-
294276
/*
295277
* Get a uniform buffer object containing the tensor sizes to use in a compute
296278
* shader. Note that the UBO will be created the first time this function is
@@ -312,12 +294,6 @@ class vTensor final {
312294
*/
313295
std::shared_ptr<api::UniformParamsBuffer> extents_ubo();
314296

315-
/*
316-
* Constructs a BufferMetadata struct based on the original sizes and strides
317-
* to pass into a shader.
318-
*/
319-
BufferMetadata get_cpu_buffer_metadata() const;
320-
321297
inline void set_is_quantized() {
322298
is_quantized_ = true;
323299
}

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
3939
TensorRef tref = graph->get_val(tref_).toTensorRef();
4040
vTensor packed = graph->get_val(packed_).toTensor();
4141

42-
// TODO: Extract to standalone function, to support other types of prepacking.
43-
api::StorageBuffer staging(
44-
graph->context(), packed.dtype(), packed.gpu_nbytes());
4542
size_t numel = api::utils::multiply_integers(tref.sizes);
43+
api::StorageBuffer staging(graph->context(), tref.dtype, numel);
4644
size_t nbytes = numel * api::element_size(tref.dtype);
4745
copy_ptr_to_staging(tref.data, staging, nbytes);
4846

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 2 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -16,60 +16,6 @@
1616
// Operator Recording Functions
1717
//
1818

19-
void record_nchw_to_buffer_op(
20-
api::Context* const context,
21-
api::VulkanBuffer& src_buffer,
22-
vTensor& v_dst) {
23-
uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_dst.gpu_numel());
24-
api::utils::uvec3 global_size = {buf_len, 1u, 1u};
25-
api::utils::uvec3 local_size = {32u, 1u, 1u};
26-
27-
api::UniformParamsBuffer cpu_buffer_metadata(
28-
context, v_dst.get_cpu_buffer_metadata());
29-
api::PipelineBarrier pipeline_barrier{};
30-
31-
context->submit_compute_job(
32-
VK_KERNEL(buffer_to_buffer),
33-
pipeline_barrier,
34-
global_size,
35-
local_size,
36-
VK_NULL_HANDLE,
37-
v_dst.buffer(
38-
pipeline_barrier,
39-
api::PipelineStage::COMPUTE,
40-
api::MemoryAccessType::WRITE),
41-
v_dst.buffer_metadata(),
42-
src_buffer,
43-
cpu_buffer_metadata.buffer());
44-
}
45-
46-
bool record_buffer_to_nchw_op(
47-
api::Context* const context,
48-
vTensor& v_src,
49-
api::VulkanBuffer& dst_buffer) {
50-
uint32_t buf_len = api::utils::safe_downcast<uint32_t>(v_src.numel());
51-
api::utils::uvec3 global_size = {buf_len, 1u, 1u};
52-
api::utils::uvec3 local_size = {4u, 1u, 1u};
53-
54-
api::UniformParamsBuffer cpu_buffer_metadata(
55-
context, v_src.get_cpu_buffer_metadata());
56-
api::PipelineBarrier pipeline_barrier{};
57-
58-
return context->submit_compute_job(
59-
VK_KERNEL(buffer_to_buffer),
60-
pipeline_barrier,
61-
global_size,
62-
local_size,
63-
VK_NULL_HANDLE,
64-
dst_buffer,
65-
cpu_buffer_metadata.buffer(),
66-
v_src.buffer(
67-
pipeline_barrier,
68-
api::PipelineStage::COMPUTE,
69-
api::MemoryAccessType::WRITE),
70-
v_src.buffer_metadata());
71-
}
72-
7319
void record_nchw_to_image_op(
7420
api::Context* const context,
7521
api::VulkanBuffer& src_buffer,
@@ -166,7 +112,7 @@ void fill_vtensor(vTensor& vten, std::vector<float>& data) {
166112
copy_ptr_to_staging(data.data(), staging_buffer, vten.gpu_nbytes());
167113

168114
if (vten.storage_type() == api::StorageType::BUFFER) {
169-
record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten);
115+
VK_THROW("Not supported!");
170116
} else {
171117
record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten);
172118
}
@@ -192,7 +138,7 @@ void extract_vtensor(vTensor& vten, std::vector<float>& data) {
192138
api::context(), api::kFloat, vten.gpu_numel());
193139

194140
if (vten.storage_type() == api::StorageType::BUFFER) {
195-
record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer());
141+
VK_THROW("Not supported!");
196142
} else {
197143
record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
198144
}

backends/xnnpack/CMakeLists.txt

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ if(NOT PYTHON_EXECUTABLE)
3333
endif()
3434

3535
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
36-
set(_common_compile_options -Wno-deprecated-declarations)
36+
set(_common_compile_options -Wno-deprecated-declarations -fPIC)
3737

3838
set(_xnnpack_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include")
3939
# Paths to headers generated from the .fbs files.
@@ -72,7 +72,7 @@ target_include_directories(
7272
xnnpack_schema INTERFACE ${_xnnpack_schema__include_dir}
7373
${EXECUTORCH_ROOT}/third-party/flatbuffers/include)
7474

75-
set(xnnpack_third_party)
75+
set(xnnpack_third_party pthreadpool cpuinfo)
7676

7777
include(cmake/Dependencies.cmake)
7878

@@ -105,8 +105,7 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*iOS\.cmake$")
105105
list(TRANSFORM _xnn_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
106106
add_executable(xnn_executor_runner ${_xnn_executor_runner__srcs})
107107
target_link_libraries(xnn_executor_runner
108-
xnnpack_backend gflags portable_ops_lib
109-
pthreadpool cpuinfo)
108+
xnnpack_backend gflags portable_ops_lib)
110109
target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options})
111110
endif()
112111

backends/xnnpack/cmake/Dependencies.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@
99
set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
1010

1111
# --- XNNPACK
12+
13+
# Setting this global PIC flag for all XNNPACK targets.
14+
# This is needed for Object libraries within XNNPACK which must
15+
# be PIC to successfully link this static libXNNPACK
16+
set(ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG ${CMAKE_POSITION_INDEPENDENT_CODE})
17+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
18+
1219
set(XNNPACK_SOURCE_DIR "${THIRD_PARTY_ROOT}/XNNPACK")
1320
set(XNNPACK_INCLUDE_DIR "${XNNPACK_SOURCE_DIR}/include")
1421
set(XNNPACK_LIBRARY_TYPE "static" CACHE STRING "")
@@ -18,3 +25,6 @@ set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
1825
add_subdirectory("${XNNPACK_SOURCE_DIR}")
1926
include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})
2027
list(APPEND xnnpack_third_party XNNPACK)
28+
29+
# Revert PIC Flag to what it originally was
30+
set(CMAKE_POSITION_INDEPENDENT_CODE ${ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG})

examples/apple/mps/scripts/mps_example.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414
from executorch.backends.apple.mps.mps_preprocess import MPSBackend
1515
from executorch.backends.apple.mps.partition.mps_partitioner import MPSPartitioner
1616

17-
from executorch.exir import EdgeCompileConfig, EdgeProgramManager
17+
from executorch.exir import (
18+
EdgeCompileConfig,
19+
EdgeProgramManager,
20+
ExecutorchProgramManager,
21+
)
1822
from executorch.exir.backend.backend_api import to_backend
1923
from executorch.exir.backend.backend_details import CompileSpec
2024
from executorch.exir.capture._config import ExecutorchBackendConfig
@@ -107,17 +111,11 @@
107111
lowered_module = to_backend(
108112
MPSBackend.__name__, edge.exported_program(), compile_specs
109113
)
110-
executorch_program = (
111-
exir.capture(
112-
lowered_module,
113-
example_inputs,
114-
exir.CaptureConfig(enable_aot=True, _unlift=False),
115-
)
116-
.to_edge(exir.EdgeCompileConfig(_check_ir_validity=False))
117-
.to_executorch(
118-
config=ExecutorchBackendConfig(extract_constant_segment=False)
119-
)
120-
)
114+
executorch_program: ExecutorchProgramManager = export_to_edge(
115+
lowered_module,
116+
example_inputs,
117+
edge_compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
118+
).to_executorch(config=ExecutorchBackendConfig(extract_constant_segment=False))
121119

122120
model_name = f"{args.model_name}_mps"
123121

examples/models/llama2/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ if(TARGET vulkan_backend)
108108
target_link_options_shared_lib(vulkan_backend)
109109
endif()
110110

111+
# Qnn backend
112+
if(TARGET qnn_executorch_backend)
113+
list(APPEND link_libraries qnn_executorch_backend)
114+
target_link_options_shared_lib(qnn_executorch_backend)
115+
endif()
116+
111117
# This one is needed for cpuinfo where it uses android
112118
# specific log lib
113119
if(ANDROID)

0 commit comments

Comments
 (0)