Commit daacbe2
Update base for Update on "[ET-VK][Ops] aten.convolution (Bias=False)"
The final touches to get ET-VK convolution on par with ATen-VK's convolution.

## Idea

In our shaders, we add the bias to our sum:

```
${VEC4_T[DTYPE]} sum = texelFetch(bias_in, ivec2(pos.z, 0), 0);
```

To keep the shaders as they are, we implement the no-bias case by allocating a buffer of zeros; the shader then adds zero to the sum.

## Issue

If `Bias=False`, the dummy buffer of zeros is not serialized with the graph. The bias ValueRef is deserialized in the runtime as `TypeTag::NONE`, not `TypeTag::TENSORREF`.

## Solution

If `TypeTag::NONE` is given, (1) create the `vTensor` using the `out_channels` value from the weights, and (2) allocate a StagingBuffer of that size. The StagingBuffer is transferred to GPU memory and initialized to zeros.

Differential Revision: [D55814589](https://our.internmc.facebook.com/intern/diff/D55814589/)

[ghstack-poisoned]
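To make the Solution concrete, here is a minimal, self-contained sketch. The helper below is illustrative only, not the actual ExecuTorch API: when the bias deserializes as `TypeTag::NONE`, a zero-filled staging buffer sized to the weight's `out_channels` backs the bias tensor, so the shader's `texelFetch(bias_in, ...)` reads zeros.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical stand-in for the runtime's StagingBuffer path: build a
// host-side buffer of out_channels zeros, which would then be copied
// into the GPU memory backing the bias vTensor.
std::vector<float> make_zero_bias_staging(std::size_t out_channels) {
  // Zero-filled, so the shader's bias add contributes nothing.
  return std::vector<float>(out_channels, 0.0f);
}

int main() {
  // e.g. a conv2d weight with 64 output channels and Bias=False
  std::vector<float> staging = make_zero_bias_staging(64);
  return staging[0] == 0.0f ? 0 : 1;
}
```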
2 parents a85f33e + 99c4f4e commit daacbe2

52 files changed: +1058 −602 lines. (Large commit: some file contents are hidden by default, so not all changed files appear below.)

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,6 +2,7 @@
 buck-out/
 cmake-out/
 cmake-android-out/
+cmake-out-android/
 cmake-ios-out/
 ethos-u-scratch/
 executorch.egg-info
```

CMakeLists.txt

Lines changed: 13 additions & 9 deletions
```diff
@@ -352,23 +352,27 @@ add_subdirectory(schema)
 # Only contains primitive operators; does not contain portable kernels or other
 # full operators. Does not contain any backends.
 #
-
-add_library(executorch ${_executorch__srcs})
-target_link_libraries(executorch PRIVATE program_schema)
-target_link_options_shared_lib(executorch)
+add_library(executorch_no_prim_ops ${_executorch_no_prim_ops__srcs})
+target_link_libraries(executorch_no_prim_ops PRIVATE program_schema)
 # Check if dl exists for this toolchain and only then link it.
 find_library(DL_LIBRARY_EXISTS NAMES dl)
 # Check if the library was found
 if(DL_LIBRARY_EXISTS)
-  target_link_libraries(executorch PRIVATE dl) # For dladdr()
+  target_link_libraries(executorch_no_prim_ops PRIVATE dl) # For dladdr()
 endif()
-target_include_directories(executorch PUBLIC ${_common_include_directories})
-target_compile_options(executorch PUBLIC ${_common_compile_options})
+target_include_directories(executorch_no_prim_ops PUBLIC ${_common_include_directories})
+target_compile_options(executorch_no_prim_ops PUBLIC ${_common_compile_options})
 if(MAX_KERNEL_NUM)
-  target_compile_definitions(executorch
+  target_compile_definitions(executorch_no_prim_ops
                              PRIVATE MAX_KERNEL_NUM=${MAX_KERNEL_NUM})
 endif()
 
+add_library(executorch ${_executorch__srcs})
+target_link_libraries(executorch PRIVATE executorch_no_prim_ops)
+target_include_directories(executorch PUBLIC ${_common_include_directories})
+target_compile_options(executorch PUBLIC ${_common_compile_options})
+target_link_options_shared_lib(executorch)
+
 #
 # portable_ops_lib: A library to register core ATen ops using portable kernels,
 # see kernels/portable/CMakeLists.txt.
@@ -406,7 +410,7 @@ endif()
 # Install `executorch` library as well as `executorch-config.cmake` under
 # ${CMAKE_INSTALL_PREFIX}/
 install(
-  TARGETS executorch
+  TARGETS executorch executorch_no_prim_ops
   DESTINATION lib
   INCLUDES
   DESTINATION ${_common_include_directories})
```

backends/qualcomm/builders/op_dequantize.py

Lines changed: 10 additions & 14 deletions
```diff
@@ -56,20 +56,16 @@ def define_node(
 
 
 @register_node_visitor
-class PerTensorDequantizeDefault(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_tensor.default"]
+class PerTensorDequantize(DequantizeOpBase):
+    target = [
+        "quantized_decomposed.dequantize_per_tensor.default",
+        "quantized_decomposed.dequantize_per_tensor.tensor",
+    ]
 
 
 @register_node_visitor
-class PerTensorDequantizeTensor(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_tensor.tensor"]
-
-
-@register_node_visitor
-class PerChannelDequantizeDefault(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_channel.default"]
-
-
-@register_node_visitor
-class PerChannelDequantizeTensor(DequantizeOpBase):
-    target = ["quantized_decomposed.dequantize_per_channel.tensor"]
+class PerChannelDequantize(DequantizeOpBase):
+    target = [
+        "quantized_decomposed.dequantize_per_channel.default",
+        "quantized_decomposed.dequantize_per_channel.tensor",
+    ]
```

backends/qualcomm/passes/convert_hardsigmoid.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -25,6 +25,10 @@ def call(self, graph_module: torch.fx.GraphModule):
         partitions = get_source_partitions(graph, [torch.nn.Hardsigmoid])
         for _, src_partitions in partitions.items():
             for src_partition in src_partitions:
+                if exir_ops.edge.aten.hardswish.default in [
+                    node.target for node in src_partition.nodes
+                ]:
+                    continue
                 if self.quantization_capture:
                     # only one hardsigmoid op will be seen
                     input_nodes = src_partition.input_nodes
@@ -34,8 +38,6 @@ def call(self, graph_module: torch.fx.GraphModule):
                 else:
                     in_ops_target = exir_ops.edge.aten.add.Tensor
                     out_ops_target = exir_ops.edge.aten.div.Tensor
-                    # see the reverse engineering logic hardswish
-                    # https://shorturl.at/pACEL
                     input_nodes = [
                         n for n in src_partition.nodes if n.target is in_ops_target
                     ]
```

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 59 additions & 0 deletions
```diff
@@ -6,8 +6,10 @@
 import json
 import subprocess
 import sys
+import tempfile
 import unittest
 from multiprocessing.connection import Listener
+from pathlib import Path
 
 import torch
 from executorch.backends.qualcomm.tests.utils import (
@@ -1102,6 +1104,19 @@ def test_qnn_backend_shared_buffer(self):
             expected_partitions=1,
         )
 
+    def test_qnn_backend_online_prepare(self):
+        backend_options = generate_htp_compiler_spec(use_fp16=True)
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+            debug=False,
+            saver=False,
+            online_prepare=True,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        self.lower_module_and_test_output(module, sample_input)
+
 
 class TestQNNQuantizedUtils(TestQNN):
     # TODO: refactor to support different backends
@@ -1223,6 +1238,20 @@ def test_qnn_backend_shared_buffer(self):
             expected_partitions=1,
         )
 
+    def test_qnn_backend_online_prepare(self):
+        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
+            soc_model=self.arch_table[TestQNN.model],
+            backend_options=backend_options,
+            debug=False,
+            saver=False,
+            online_prepare=True,
+        )
+        module = SimpleModel()  # noqa: F405
+        sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
+        module = self.get_qdq_module(module, sample_input)
+        self.lower_module_and_test_output(module, sample_input)
+
 
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
@@ -1640,6 +1669,29 @@ def test_ptq_mobilebert(self):
         for k, v in cpu.items():
             self.assertLessEqual(abs(v[0] - htp[k][0]), 5)
 
+    def test_export_example(self):
+        if not self.required_envs([self.model_name]):
+            self.skipTest("missing required envs")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            cmds = [
+                "python",
+                "qualcomm/scripts/export_example.py",
+                "--model_name",
+                self.model_name,
+                "--output_folder",
+                "{}/".format(tmp_dir),
+                "--generate_etrecord",
+            ]
+
+            p = subprocess.Popen(
+                cmds, stdout=subprocess.DEVNULL, cwd=f"{self.executorch_root}/examples"
+            )
+            p.communicate()
+            self.assertTrue(
+                Path("{0}/{1}.pte".format(tmp_dir, self.model_name)).exists()
+            )
+
 
 def setup_environment():
     parser = setup_common_args_and_variables()
@@ -1669,6 +1721,12 @@ def setup_environment():
         default="",
         type=str,
     )
+    parser.add_argument(
+        "-n",
+        "--model_name",
+        help="Input the model to export",
+        type=str,
+    )
     parser.add_argument(
         "-o",
         "--online_prepare",
@@ -1697,6 +1755,7 @@ def setup_environment():
     TestQNN.artifact_dir = args.artifact_dir
     TestQNN.image_dataset = args.image_dataset
     TestQNN.pretrained_weight = args.pretrained_weight
+    TestQNN.model_name = args.model_name
     TestQNN.online_prepare = args.online_prepare
     TestQNN.enable_profile = args.enable_profile
     TestQNN.error_only = args.error_only
```

backends/qualcomm/utils/utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -19,6 +19,7 @@
     ConvertBinaryOpsWithScalar,
 )
 from executorch.backends.qualcomm.passes.convert_bmm_to_matmul import ConvertBmmToMatmul
+from executorch.backends.qualcomm.passes.convert_hardsigmoid import ConvertHardsigmoid
 from executorch.backends.qualcomm.passes.convert_interpolate_with_upsample2d import (
     ConvertInterpolateWithUpsample2D,
 )
@@ -103,6 +104,7 @@ def _transform(edge_program: ExportedProgram) -> None:
     graph_module = edge_program.graph_module
     RemoveClone()(graph_module)
     ConvertToLinear()(graph_module)
+    ConvertHardsigmoid()(graph_module)
    ConvertBmmToMatmul()(graph_module)
    ConvertInterpolateWithUpsample2D()(graph_module)
    I64toI32(edge_program)(graph_module)
```

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 7 additions & 7 deletions
```diff
@@ -77,26 +77,26 @@ api::StorageType get_storage_type(
     const vkgraph::VkStorageType& vk_storage_type) {
   switch (vk_storage_type) {
     case vkgraph::VkStorageType::BUFFER:
-      return api::StorageType::BUFFER;
+      return api::kBuffer;
     case vkgraph::VkStorageType::TEXTURE_3D:
-      return api::StorageType::TEXTURE_3D;
+      return api::kTexture3D;
     case vkgraph::VkStorageType::TEXTURE_2D:
-      return api::StorageType::TEXTURE_2D;
+      return api::kTexture2D;
     default:
       break;
   }
-  return api::StorageType::UNKNOWN;
+  VK_THROW("Invalid storage type encountered!");
 }
 
 api::GPUMemoryLayout get_memory_layout(
     const vkgraph::VkMemoryLayout& vk_memory_layout) {
   switch (vk_memory_layout) {
     case vkgraph::VkMemoryLayout::TENSOR_WIDTH_PACKED:
-      return api::GPUMemoryLayout::TENSOR_WIDTH_PACKED;
+      return api::kWidthPacked;
     case vkgraph::VkMemoryLayout::TENSOR_HEIGHT_PACKED:
-      return api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED;
+      return api::kHeightPacked;
     case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED:
-      return api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED;
+      return api::kChannelsPacked;
     default:
       break;
   }
```
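One behavioral note on this diff: `get_storage_type` previously returned an `UNKNOWN` sentinel that every caller had to remember to check; it now fails fast via `VK_THROW` on an unrecognized serialized value. A standalone sketch of the same pattern, with simplified enums and `std::runtime_error` standing in for `VK_THROW`:

```cpp
#include <stdexcept>

// Simplified stand-ins for the serialized and runtime enums.
enum class VkStorageType { BUFFER, TEXTURE_3D, TEXTURE_2D, UNRECOGNIZED };
enum class StorageType { Buffer, Texture3D, Texture2D };

StorageType get_storage_type(VkStorageType s) {
  switch (s) {
    case VkStorageType::BUFFER:
      return StorageType::Buffer;
    case VkStorageType::TEXTURE_3D:
      return StorageType::Texture3D;
    case VkStorageType::TEXTURE_2D:
      return StorageType::Texture2D;
    default:
      break;
  }
  // Fail fast on bad serialized data instead of returning a sentinel
  // that callers might forget to check.
  throw std::runtime_error("Invalid storage type encountered!");
}

int main() {
  return get_storage_type(VkStorageType::BUFFER) == StorageType::Buffer ? 0 : 1;
}
```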

backends/vulkan/runtime/api/Shader.cpp

Lines changed: 2 additions & 21 deletions
```diff
@@ -23,38 +23,19 @@ ShaderInfo::ShaderInfo()
           0u,
       } {}
 
-ShaderInfo::ShaderInfo(
-    std::string name,
-    const uint32_t* const spirv_bin,
-    const uint32_t size,
-    std::vector<VkDescriptorType> layout)
-    : src_code{
-          spirv_bin,
-          size,
-      },
-      kernel_name{std::move(name)},
-      kernel_layout{std::move(layout)} {}
-
 ShaderInfo::ShaderInfo(
     std::string name,
     const uint32_t* const spirv_bin,
     const uint32_t size,
     std::vector<VkDescriptorType> layout,
-    const std::vector<uint32_t>& tile_size,
-    const StorageType bias_storage_type,
-    const StorageType weight_storage_type)
+    const utils::uvec3 tile_size)
     : src_code{
           spirv_bin,
           size,
       },
       kernel_name{std::move(name)},
       kernel_layout{std::move(layout)},
-      tile_size(tile_size),
-      bias_storage_type(bias_storage_type),
-      weight_storage_type(weight_storage_type) {
-  for (uint64_t i = 0; i < tile_size.size(); ++i) {
-    out_tile_size.data[i] = tile_size[i];
-  }
+      out_tile_size(tile_size) {
 }
 
 bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) {
```
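The constructor change above replaces an element-by-element copy out of a `std::vector<uint32_t>` with direct member initialization from a fixed-size `utils::uvec3`. A standalone sketch of the resulting shape, with types simplified from the real headers:

```cpp
#include <cstdint>

// Simplified mirror of utils::uvec3's fixed-size storage.
struct uvec3 {
  uint32_t data[3];
};

struct ShaderInfoSketch {
  uvec3 out_tile_size{1u, 1u, 1u};

  // The tile size now arrives as a uvec3, so the member is
  // copy-initialized directly: no runtime loop, and no possibility of
  // a mismatched vector length.
  explicit ShaderInfoSketch(const uvec3 tile_size)
      : out_tile_size(tile_size) {}
};

int main() {
  ShaderInfoSketch info(uvec3{2u, 2u, 1u});
  return info.out_tile_size.data[0] == 2u ? 0 : 1;
}
```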

backends/vulkan/runtime/api/Shader.h

Lines changed: 2 additions & 13 deletions
```diff
@@ -62,25 +62,14 @@ struct ShaderInfo final {
   // Shader Metadata
   utils::uvec3 out_tile_size{1u, 1u, 1u};
 
-  std::vector<uint32_t> tile_size;
-  StorageType bias_storage_type{StorageType::UNKNOWN};
-  StorageType weight_storage_type{StorageType::UNKNOWN};
-
   explicit ShaderInfo();
-  explicit ShaderInfo(std::string, const char*);
-  explicit ShaderInfo(
-      std::string,
-      const uint32_t*,
-      const uint32_t,
-      std::vector<VkDescriptorType>);
+
   explicit ShaderInfo(
       std::string,
       const uint32_t*,
       const uint32_t,
       std::vector<VkDescriptorType>,
-      const std::vector<uint32_t>& tile_size,
-      const StorageType bias_storage_type,
-      const StorageType weight_storage_type);
+      const utils::uvec3 tile_size);
 };
 
 bool operator==(const ShaderInfo& _1, const ShaderInfo& _2);
```
