pytorch
diff --git a/‎backends/qualcomm/CMakeLists.txt
Lines changed: 12 additions & 0 deletions b/‎backends/qualcomm/CMakeLists.txt
Lines changed: 12 additions & 0 deletions
diff --git a/‎backends/qualcomm/passes/i64_to_i32.py
Lines changed: 1 addition & 1 deletion b/‎backends/qualcomm/passes/i64_to_i32.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/qualcomm/qnn_preprocess.py
Lines changed: 4 additions & 2 deletions b/‎backends/qualcomm/qnn_preprocess.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Lines changed: 6 additions & 1 deletion b/‎backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
Lines changed: 6 additions & 1 deletion
diff --git a/‎backends/qualcomm/runtime/QnnManager.cpp
Lines changed: 17 additions & 0 deletions b/‎backends/qualcomm/runtime/QnnManager.cpp
Lines changed: 17 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/QnnManager.h
Lines changed: 2 additions & 0 deletions b/‎backends/qualcomm/runtime/QnnManager.h
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/backends/CMakeLists.txt
Lines changed: 7 additions & 0 deletions b/‎backends/qualcomm/runtime/backends/CMakeLists.txt
Lines changed: 7 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnBackendCommon.h
Lines changed: 4 additions & 0 deletions b/‎backends/qualcomm/runtime/backends/QnnBackendCommon.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
Lines changed: 2 additions & 0 deletions b/‎backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnGraphCommon.cpp
Lines changed: 16 additions & 0 deletions b/‎backends/qualcomm/runtime/backends/QnnGraphCommon.cpp
Lines changed: 16 additions & 0 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnGraphCommon.h
Lines changed: 12 additions & 11 deletions b/‎backends/qualcomm/runtime/backends/QnnGraphCommon.h
Lines changed: 12 additions & 11 deletions
diff --git a/‎backends/qualcomm/runtime/backends/QnnProfiler.cpp
Lines changed: 122 additions & 0 deletions b/‎backends/qualcomm/runtime/backends/QnnProfiler.cpp
Lines changed: 122 additions & 0 deletions
@@ -131,6 +131,7 @@ add_library(qnn_implementation STATIC)
 add_library(qnn_sys_function_interface INTERFACE)
 add_library(qnn_sys_implementation STATIC)
 add_library(qnn_logger STATIC)
+add_library(qnn_profiler STATIC)
 add_library(qnn_device STATIC)
 add_library(qnn_context STATIC)
 add_library(qnn_backend_cache STATIC)
@@ -179,6 +180,10 @@ target_link_libraries(qnn_executorch_logging
     PRIVATE
     qnn_schema
 )
+# Link the profiler library against the logging library.
+target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging)
 target_link_libraries(qnn_logger
     PRIVATE
     qnn_implementation
@@ -213,6 +218,7 @@ target_link_libraries(qnn_graph
     qnn_executorch_logging
     qnn_implementation
     qnn_context
+    qnn_profiler
 )
 target_link_libraries(qnn_factory
     PUBLIC
@@ -249,6 +255,12 @@ target_link_libraries(utils
 #
 target_link_options_shared_lib(qnn_executorch_backend)
 
+#
+# Define ET_EVENT_TRACER_ENABLED for the executorch target and, because the
+# definition is PUBLIC, for every target that links against it.
+# Use target_compile_definitions rather than a raw -D compile option so the
+# definition is tracked as a definition by CMake.
+#
+target_compile_definitions(executorch PUBLIC ET_EVENT_TRACER_ENABLED)
+
+
 #
 # add sources
 #
 
@@ -30,7 +30,7 @@ def _update_meta(self, node: torch.fx.node) -> None:
             )
         else:
             if meta_val.dtype == torch.int64:
-                node.meta["val"] = meta_val.to(torch.int32)
+                node.meta["val"] = meta_val.to(torch.float)
 
     def _cast_to_int32(self, graph_module: torch.fx.GraphModule):
         for n in graph_module.graph.nodes:
 
@@ -89,5 +89,7 @@ def preprocess(
         )
         assert len(qnn_context_binary) != 0, "Failed to generate Qnn context binary."
         qnn_manager.Destroy()
-
-        return PreprocessResult(bytes(qnn_context_binary))
+        # For now, debug_handle_map is not used by QNN ExecuTorch
+        return PreprocessResult(
+            processed_bytes=bytes(qnn_context_binary), debug_handle_map={}
+        )
@@ -176,7 +176,7 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
 }
 
 Error QnnExecuTorchBackend::execute(
-    __ET_UNUSED BackendExecutionContext& context,
+    BackendExecutionContext& context,
     DelegateHandle* handle,
     EValue** args) const {
   QnnManager* qnn_manager = static_cast<QnnManager*>(handle);
@@ -211,6 +211,11 @@ Error QnnExecuTorchBackend::execute(
           Error::Ok,
       Internal,
       "Fail to execute graph");
+  ET_CHECK_OR_RETURN_ERROR(
+      qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok,
+      Internal,
+      "Fail to profile graph");
+
   return Error::Ok;
 }
 
 
@@ -47,6 +47,9 @@ QnnManager::QnnManager(
         options_->tensor_dump_output_path()->c_str());
     QNN_EXECUTORCH_LOG_INFO(
         "log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level()));
+    QNN_EXECUTORCH_LOG_INFO(
+        "profile_level: %s",
+        EnumNameQnnExecuTorchProfileLevel(options_->profile_level()));
     QNN_EXECUTORCH_LOG_INFO(
         "the size of qnn context binary: %d",
         qnn_executorch_context_binary.nbytes);
@@ -194,6 +197,20 @@ Error QnnManager::Execute(
   return Error::Ok;
 }
 
+// Collects profiling data for the last graph execution and reports it through
+// |event_tracer|. Does nothing (and returns Error::Ok) when the configured
+// profile level is kProfileOff; returns Error::Internal if the graph reports
+// a QNN error while profiling.
+Error QnnManager::ProfileExecuteData(EventTracer* event_tracer) {
+  if (options_->profile_level() == QnnExecuTorchProfileLevel::kProfileOff) {
+    return Error::Ok;
+  }
+
+  const Qnn_ErrorHandle_t status =
+      backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData(event_tracer);
+  if (status == QNN_SUCCESS) {
+    return Error::Ok;
+  }
+
+  QNN_EXECUTORCH_LOG_ERROR(
+      " Failed to profile. Error %d", QNN_GET_ERROR_CODE(status));
+  return Error::Internal;
+}
+
 void QnnManager::Destroy() {
   QNN_EXECUTORCH_LOG_INFO("Destroy Qnn backend parameters");
   backend_params_ptr_.reset(new BackendConfigParameters());
 
@@ -38,6 +38,8 @@ class QnnManager {
       const std::vector<Qnn_Tensor_t>& input_tensor_structs,
       std::vector<Qnn_Tensor_t>& output_tensor_structs);
 
+  Error ProfileExecuteData(EventTracer* event_tracer);
+
   void Destroy();
 
   bool IsAvailable();
 
@@ -41,6 +41,13 @@ target_sources(qnn_logger
     ${CMAKE_CURRENT_LIST_DIR}/QnnLogger.cpp
 )
 
+# qnn_profiler: header and implementation file of the QNN profiler
+target_sources(qnn_profiler PRIVATE
+    ${CMAKE_CURRENT_LIST_DIR}/QnnProfiler.h
+    ${CMAKE_CURRENT_LIST_DIR}/QnnProfiler.cpp
+)
+
 # qnn_device
 set(HOST_ARCHITECTURE
     ${CMAKE_CURRENT_LIST_DIR}/htpbackend/${CMAKE_SYSTEM_PROCESSOR}
 
@@ -27,6 +27,10 @@ class QnnBackend {
       : handle_(nullptr), implementation_(implementation), logger_(logger) {}
 
   virtual ~QnnBackend();
+  // Answers whether the sub events of a profiling event of type |event_type|
+  // should be inspected for per-node timings. This base version rejects every
+  // event type; it is virtual, so presumably concrete backends override it
+  // (no override is visible here).
+  virtual bool IsProfileEventTypeParentOfNodeTime(
+      QnnProfile_EventType_t event_type) {
+    static_cast<void>(event_type);
+    return false;
+  }
 
   Error Configure();
 
 
@@ -63,7 +63,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
 
       backend_params->qnn_graph_ptr_ = std::make_unique<HtpGraph>(
           implementation,
+          backend_params->qnn_backend_ptr_.get(),
           backend_params->qnn_context_ptr_.get(),
+          options->profile_level(),
           options->graph_name()->str(),
           options->soc_info(),
           htp_options);
 
@@ -51,9 +51,25 @@ Error QnnGraph::Configure() {
     return Error::Internal;
   }
 
+  // The profiler needs to be created after the backend is created.
+  profile_ =
+      std::make_unique<QnnProfile>(implementation_, backend_, profile_level_);
   return Error::Ok;
 }
 
+// Executes the graph with the given input and output tensors and returns the
+// QNN status of the call.
+//
+// If a profiler is available, its handle is passed to QNN for this execution;
+// otherwise the graph runs with a null profile handle. No signal handle is
+// used.
+Qnn_ErrorHandle_t QnnGraph::GraphExecute(
+    const std::vector<Qnn_Tensor_t>& input_tensor_structs,
+    std::vector<Qnn_Tensor_t>& output_tensor_structs) {
+  // profile_ is only created as the last step of Configure(). Fall back to a
+  // null profile handle instead of dereferencing an empty unique_ptr.
+  auto profile_handle = profile_ ? profile_->GetHandle() : nullptr;
+  return implementation_.GetQnnInterface().qnn_graph_execute(
+      handle_,
+      input_tensor_structs.data(),
+      input_tensor_structs.size(),
+      output_tensor_structs.data(),
+      output_tensor_structs.size(),
+      profile_handle,
+      /*signalHandle=*/nullptr);
+}
+
 Error QnnGraph::EnsureTensorInQnnGraph(
     const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
   const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
 
@@ -11,6 +11,7 @@
 #include <executorch/backends/qualcomm/runtime/Logging.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnContextCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
+#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
 
 #include <vector>
 
@@ -23,11 +24,15 @@ class QnnGraph {
  public:
   explicit QnnGraph(
       const QnnImplementation& implementation,
+      QnnBackend* backend,
       QnnContext* context,
+      const QnnExecuTorchProfileLevel& profile_level,
       const std::string& graph_name)
       : handle_(nullptr),
         implementation_(implementation),
+        backend_(backend),
         context_(context),
+        profile_level_(profile_level),
         graph_name_(graph_name) {}
 
   virtual ~QnnGraph(){};
@@ -36,16 +41,7 @@ class QnnGraph {
 
   Qnn_ErrorHandle_t GraphExecute(
       const std::vector<Qnn_Tensor_t>& input_tensor_structs,
-      std::vector<Qnn_Tensor_t>& output_tensor_structs) {
-    return implementation_.GetQnnInterface().qnn_graph_execute(
-        handle_,
-        input_tensor_structs.data(),
-        input_tensor_structs.size(),
-        output_tensor_structs.data(),
-        output_tensor_structs.size(),
-        /*profile=*/nullptr,
-        /*signalHandle=*/nullptr);
-  };
+      std::vector<Qnn_Tensor_t>& output_tensor_structs);
 
   Qnn_ErrorHandle_t GraphAddNode(const Qnn_OpConfig_t& op_config) {
     return implementation_.GetQnnInterface().qnn_graph_add_node(
@@ -58,7 +54,9 @@ class QnnGraph {
     return implementation_.GetQnnInterface().qnn_graph_finalize(
         handle_, nullptr /* profile_handle */, nullptr /* signal_handle */);
   };
-
+  // Hands |event_tracer| to the graph's profiler and returns the QNN status
+  // reported by QnnProfile::ProfileData.
+  Qnn_ErrorHandle_t ProfileExecuteData(EventTracer* event_tracer) {
+    QnnProfile& profiler = *profile_;
+    return profiler.ProfileData(event_tracer);
+  }
   Qnn_GraphHandle_t GetHandle() {
     return handle_;
   }
@@ -71,8 +69,11 @@ class QnnGraph {
  private:
   Qnn_GraphHandle_t handle_;
   const QnnImplementation& implementation_;
+  QnnBackend* backend_;
   QnnContext* context_;
+  QnnExecuTorchProfileLevel profile_level_;
   std::string graph_name_;
+  std::unique_ptr<QnnProfile> profile_;
 };
 } // namespace qnn
 } // namespace executor
 
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
+#include <iostream>
+
+namespace torch {
+namespace executor {
+namespace qnn {
+
+// Creates a QNN profile handle on |backend| unless |profile_level| is
+// kProfileOff. A failed creation is logged as a warning and tolerated: the
+// object is still constructed, with handle_ left as nullptr.
+QnnProfile::QnnProfile(
+    const QnnImplementation& implementation,
+    QnnBackend* backend,
+    const QnnExecuTorchProfileLevel& profile_level)
+    : handle_(nullptr), implementation_(implementation), backend_(backend) {
+  if (profile_level == QnnExecuTorchProfileLevel::kProfileOff) {
+    return;
+  }
+
+  const QnnInterface& qnn = implementation_.GetQnnInterface();
+  const Qnn_ErrorHandle_t status = qnn.qnn_profile_create(
+      backend_->GetHandle(), static_cast<int>(profile_level), &handle_);
+  if (status == QNN_SUCCESS) {
+    return;
+  }
+
+  QNN_EXECUTORCH_LOG_WARN(
+      "Failed to create profile_handle for backend "
+      " %u, error=%d",
+      qnn.GetBackendId(),
+      QNN_GET_ERROR_CODE(status));
+
+  // Profiling is best-effort: discard whatever the failed call may have
+  // written and carry on without a profile handle.
+  handle_ = nullptr;
+}
+
+// Reads the profiling events recorded on handle_ and forwards per-node
+// timings to |event_tracer|.
+//
+// Only events for which the backend's IsProfileEventTypeParentOfNodeTime()
+// returns true are expanded. Of their sub events, only those of type
+// QNN_PROFILE_EVENTTYPE_NODE measured in microseconds or cycles are logged.
+//
+// Returns QNN_SUCCESS when every query succeeded, or when there is no profile
+// handle to query; otherwise returns the first QNN error encountered.
+Qnn_ErrorHandle_t QnnProfile::ProfileData(EventTracer* event_tracer) {
+  // The constructor leaves handle_ null when profiling is off or when the
+  // profile handle could not be created. In that case there is nothing to
+  // collect, so do not query QNN with a null handle.
+  if (handle_ == nullptr) {
+    return QNN_SUCCESS;
+  }
+
+  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
+  const QnnProfile_EventId_t* events_ptr = nullptr;
+  std::uint32_t num_events = 0;
+  Qnn_ErrorHandle_t error =
+      qnn_interface.qnn_profile_get_events(handle_, &events_ptr, &num_events);
+  if (error != QNN_SUCCESS) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error));
+    return error;
+  }
+
+  QnnProfile_EventData_t event_data;
+  for (std::uint32_t i = 0; i < num_events; ++i) {
+    error =
+        qnn_interface.qnn_profile_get_event_data(events_ptr[i], &event_data);
+    if (error != QNN_SUCCESS) {
+      QNN_EXECUTORCH_LOG_ERROR(
+          "ProfileData failed to get event data "
+          "for event %u: %d",
+          i,
+          QNN_GET_ERROR_CODE(error));
+      return error;
+    }
+
+    // Check an event's sub events only if it relates to graph execution time
+    // (and its sub events are the individual op executions):
+    if (!backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) {
+      continue;
+    }
+
+    const QnnProfile_EventId_t* sub_events_ptr = nullptr;
+    std::uint32_t num_sub_events = 0;
+    error = qnn_interface.qnn_profile_get_sub_events(
+        events_ptr[i], &sub_events_ptr, &num_sub_events);
+    if (error != QNN_SUCCESS) {
+      QNN_EXECUTORCH_LOG_ERROR(
+          "ProfileData failed to get sub events "
+          "for event %u: %d",
+          i,
+          QNN_GET_ERROR_CODE(error));
+      return error;
+    }
+
+    QnnProfile_EventData_t sub_event_data;
+    for (std::uint32_t j = 0; j < num_sub_events; ++j) {
+      error = qnn_interface.qnn_profile_get_event_data(
+          sub_events_ptr[j], &sub_event_data);
+      if (error != QNN_SUCCESS) {
+        QNN_EXECUTORCH_LOG_ERROR(
+            "ProfileData failed to get sub "
+            "event data for sub event %u of event %u: %d",
+            j,
+            i,
+            QNN_GET_ERROR_CODE(error));
+        return error;
+      }
+
+      const bool is_node_event =
+          sub_event_data.type == QNN_PROFILE_EVENTTYPE_NODE;
+      // NOTE(review): microsecond and cycle values are logged the same way,
+      // so the unit is not carried into the tracer - confirm this is intended.
+      const bool has_supported_unit =
+          sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC ||
+          sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES;
+      if (is_node_event && has_supported_unit) {
+        // The event value is passed in the last position with 0 before it;
+        // presumably these are the start/end arguments of the tracer hook.
+        torch::executor::event_tracer_log_profiling_delegate(
+            event_tracer,
+            sub_event_data.identifier,
+            /*delegate_debug_id=*/
+            static_cast<torch::executor::DebugHandle>(-1),
+            0,
+            sub_event_data.value);
+      }
+    }
+  }
+  // Every failure path above returns early, so reaching here means success.
+  return QNN_SUCCESS;
+}
+
+// Frees the QNN profile handle if one was created. A failure to free it is
+// logged; the handle is cleared either way.
+QnnProfile::~QnnProfile() {
+  const QnnInterface& qnn = implementation_.GetQnnInterface();
+  if (handle_ == nullptr) {
+    return;
+  }
+
+  const Qnn_ErrorHandle_t status = qnn.qnn_profile_free(handle_);
+  if (status != QNN_SUCCESS) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "Failed to free QNN profile_handle. Backend "
+        "ID %u, error %d",
+        qnn.GetBackendId(),
+        QNN_GET_ERROR_CODE(status));
+  }
+  handle_ = nullptr;
+}
+} // namespace qnn
+} // namespace executor
+} // namespace torch
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ def _update_meta(self, node: torch.fx.node) -> None:`
`30`	`30`	`)`
`31`	`31`	`else:`
`32`	`32`	`if meta_val.dtype == torch.int64:`
`33`		`- node.meta["val"] = meta_val.to(torch.int32)`
	`33`	`+ node.meta["val"] = meta_val.to(torch.float)`
`34`	`34`
`35`	`35`	`def _cast_to_int32(self, graph_module: torch.fx.GraphModule):`
`36`	`36`	`for n in graph_module.graph.nodes:`