Skip to content

Commit 0b12daf

Browse files
shewu-quicfacebook-github-bot
authored andcommitted
Qualcomm AI Engine Direct - Implement sdk profiler and integrate with Qnn profiler (#2227)
Summary: - Implement Qnn Profiler for htp backend For now, only support kProfileDetailed to profile the performance of each operator with cycle unit. Follow up item: Add more qnn profile item - Intergrated with sdk profiler - Add the argument etdump_path to dump etdump which analyzes the contents by INSPECTOR in qnn_executorch_runner - Add unit test to test profile - Add export example to generate etrecord Reproduce commands: ``` python3 backends/qualcomm/tests/test_qnn_delegate.py TestQNNFloatingPointOperator.test_qnn_backend_conv2d -b /local3/mnt/workspace/shewu/executorch/executorch_shewu/executorch/build_android -s $LANAI1 -H $TWL1 -m SM8650 -r /local3/mnt/workspace/shewu/executorch/executorch_shewu/executorch -a /local3/mnt/workspace/shewu/executorch/unit_test -i /local3/mnt/workspace/shewu/executorch/models/data/ImageNet-Mini/images --enable_profile # Pull the EtDump adb pull /data/local/tmp/qnn_executorch_test/etdump.etdp . # Run inspector to produce the following table python3 -m sdk.inspector.inspector_cli --etdump_path etdump.etdp --etrecord_path etrecord.bin ``` ``` ╒════╤════════════════════╤══════════════════════════════════════════════╤═════════════╤═════════════╤═════════════╤═════════════╤═════════════╤═════════════╤════════════╤═══════════════════╤═════════════════════════╕ │ │ event_block_name │ event_name │ p10 (ms) │ p50 (ms) │ p90 (ms) │ avg (ms) │ min (ms) │ max (ms) │ op_types │ is_delegated_op │ delegate_backend_name │ ╞════╪════════════════════╪══════════════════════════════════════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪═════════════╪════════════╪═══════════════════╪═════════════════════════╡ │ 0 │ Default │ Method::init │ 123.898 │ 123.898 │ 123.898 │ 123.898 │ 123.898 │ 123.898 │ [] │ False │ │ 
├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 1 │ Default │ Program::load_method │ 123.926 │ 123.926 │ 123.926 │ 123.926 │ 123.926 │ 123.926 │ [] │ False │ │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 2 │ Execute │ Input OpId_2 (cycles) │ 4018 │ 4018 │ 4018 │ 4018 │ 4018 │ 4018 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 3 │ Execute │ aten_permute_copy_default:OpId_17 (cycles) │ 16765 │ 16765 │ 16765 │ 16765 │ 16765 │ 16765 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 4 │ Execute │ aten_convolution_default:OpId_23 (cycles) │ 12768 │ 12768 │ 12768 │ 12768 │ 12768 │ 12768 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 5 │ Execute │ aten_convolution_default_1:OpId_30 (cycles) │ 9439 │ 9439 │ 9439 │ 9439 │ 9439 │ 9439 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 6 │ Execute │ aten_permute_copy_default_1:OpId_33 (cycles) │ 2551 │ 2551 
│ 2551 │ 2551 │ 2551 │ 2551 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 7 │ Execute │ OpId_0 (cycles) │ 0 │ 0 │ 0 │ 0 │ 0 │ 0 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 8 │ Execute │ Output OpId_3 (cycles) │ 3054 │ 3054 │ 3054 │ 3054 │ 3054 │ 3054 │ [] │ True │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 9 │ Execute │ DELEGATE_CALL │ 1.17151 │ 1.17151 │ 1.17151 │ 1.17151 │ 1.17151 │ 1.17151 │ [] │ False │ QnnBackend │ ├────┼────────────────────┼──────────────────────────────────────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼────────────┼───────────────────┼─────────────────────────┤ │ 10 │ Execute │ Method::execute │ 1.18318 │ 1.18318 │ 1.18318 │ 1.18318 │ 1.18318 │ 1.18318 │ [] │ False │ │ ╘════╧════════════════════╧══════════════════════════════════════════════╧═════════════╧═════════════╧═════════════╧═════════════╧═════════════╧═════════════╧════════════╧═══════════════════╧═════════════════════════╛ ``` Pull Request resolved: #2227 Reviewed By: mergennachin Differential Revision: D55224350 Pulled By: Olivia-liu fbshipit-source-id: 03b005524808476a63401048525bed39837e266c
1 parent 3152d7f commit 0b12daf

26 files changed

+441
-22
lines changed

backends/qualcomm/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ add_library(qnn_implementation STATIC)
131131
add_library(qnn_sys_function_interface INTERFACE)
132132
add_library(qnn_sys_implementation STATIC)
133133
add_library(qnn_logger STATIC)
134+
add_library(qnn_profiler STATIC)
134135
add_library(qnn_device STATIC)
135136
add_library(qnn_context STATIC)
136137
add_library(qnn_backend_cache STATIC)
@@ -179,6 +180,10 @@ target_link_libraries(qnn_executorch_logging
179180
PRIVATE
180181
qnn_schema
181182
)
183+
target_link_libraries(qnn_profiler
184+
PRIVATE
185+
qnn_executorch_logging
186+
)
182187
target_link_libraries(qnn_logger
183188
PRIVATE
184189
qnn_implementation
@@ -213,6 +218,7 @@ target_link_libraries(qnn_graph
213218
qnn_executorch_logging
214219
qnn_implementation
215220
qnn_context
221+
qnn_profiler
216222
)
217223
target_link_libraries(qnn_factory
218224
PUBLIC
@@ -249,6 +255,12 @@ target_link_libraries(utils
249255
#
250256
target_link_options_shared_lib(qnn_executorch_backend)
251257

258+
#
259+
# add compile option
260+
#
261+
target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED)
262+
263+
252264
#
253265
# add sources
254266
#

backends/qualcomm/passes/i64_to_i32.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def _update_meta(self, node: torch.fx.node) -> None:
3030
)
3131
else:
3232
if meta_val.dtype == torch.int64:
33-
node.meta["val"] = meta_val.to(torch.int32)
33+
node.meta["val"] = meta_val.to(torch.float)
3434

3535
def _cast_to_int32(self, graph_module: torch.fx.GraphModule):
3636
for n in graph_module.graph.nodes:

backends/qualcomm/qnn_preprocess.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,7 @@ def preprocess(
8989
)
9090
assert len(qnn_context_binary) != 0, "Failed to generate Qnn context binary."
9191
qnn_manager.Destroy()
92-
93-
return PreprocessResult(bytes(qnn_context_binary))
92+
# For now, debug_handle_map is not used by QNN ExecuTorch
93+
return PreprocessResult(
94+
processed_bytes=bytes(qnn_context_binary), debug_handle_map={}
95+
)

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
176176
}
177177

178178
Error QnnExecuTorchBackend::execute(
179-
__ET_UNUSED BackendExecutionContext& context,
179+
BackendExecutionContext& context,
180180
DelegateHandle* handle,
181181
EValue** args) const {
182182
QnnManager* qnn_manager = static_cast<QnnManager*>(handle);
@@ -211,6 +211,11 @@ Error QnnExecuTorchBackend::execute(
211211
Error::Ok,
212212
Internal,
213213
"Fail to execute graph");
214+
ET_CHECK_OR_RETURN_ERROR(
215+
qnn_manager->ProfileExecuteData(context.event_tracer()) == Error::Ok,
216+
Internal,
217+
"Fail to profile graph");
218+
214219
return Error::Ok;
215220
}
216221

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ QnnManager::QnnManager(
4747
options_->tensor_dump_output_path()->c_str());
4848
QNN_EXECUTORCH_LOG_INFO(
4949
"log_level: %s", EnumNameQnnExecuTorchLogLevel(options_->log_level()));
50+
QNN_EXECUTORCH_LOG_INFO(
51+
"profile_level: %s",
52+
EnumNameQnnExecuTorchProfileLevel(options_->profile_level()));
5053
QNN_EXECUTORCH_LOG_INFO(
5154
"the size of qnn context binary: %d",
5255
qnn_executorch_context_binary.nbytes);
@@ -194,6 +197,20 @@ Error QnnManager::Execute(
194197
return Error::Ok;
195198
}
196199

200+
/**
 * Hands the graph's profiling results over to the event tracer.
 *
 * When the configured profile level is kProfileOff nothing is collected and
 * Error::Ok is reported; otherwise the call is delegated to the graph's
 * ProfileExecuteData.
 *
 * @param event_tracer Tracer forwarded unchanged to the graph.
 * @return Error::Ok on success or when profiling is off, Error::Internal when
 *     the graph reports a QNN error.
 */
Error QnnManager::ProfileExecuteData(EventTracer* event_tracer) {
  // Nothing to collect when profiling is switched off.
  if (options_->profile_level() == QnnExecuTorchProfileLevel::kProfileOff) {
    return Error::Ok;
  }

  const Qnn_ErrorHandle_t status =
      backend_params_ptr_->qnn_graph_ptr_->ProfileExecuteData(event_tracer);
  if (status == QNN_SUCCESS) {
    return Error::Ok;
  }

  QNN_EXECUTORCH_LOG_ERROR(
      " Failed to profile. Error %d", QNN_GET_ERROR_CODE(status));
  return Error::Internal;
}
213+
197214
void QnnManager::Destroy() {
198215
QNN_EXECUTORCH_LOG_INFO("Destroy Qnn backend parameters");
199216
backend_params_ptr_.reset(new BackendConfigParameters());

backends/qualcomm/runtime/QnnManager.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ class QnnManager {
3838
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
3939
std::vector<Qnn_Tensor_t>& output_tensor_structs);
4040

41+
Error ProfileExecuteData(EventTracer* event_tracer);
42+
4143
void Destroy();
4244

4345
bool IsAvailable();

backends/qualcomm/runtime/backends/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ target_sources(qnn_logger
4141
${CMAKE_CURRENT_LIST_DIR}/QnnLogger.cpp
4242
)
4343

44+
# qnn_profiler
45+
target_sources(qnn_profiler
46+
PRIVATE
47+
${CMAKE_CURRENT_LIST_DIR}/QnnProfiler.h
48+
${CMAKE_CURRENT_LIST_DIR}/QnnProfiler.cpp
49+
)
50+
4451
# qnn_device
4552
set(HOST_ARCHITECTURE
4653
${CMAKE_CURRENT_LIST_DIR}/htpbackend/${CMAKE_SYSTEM_PROCESSOR}

backends/qualcomm/runtime/backends/QnnBackendCommon.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ class QnnBackend {
2727
: handle_(nullptr), implementation_(implementation), logger_(logger) {}
2828

2929
virtual ~QnnBackend();
30+
// Base-class answer: no profile event type is treated as the parent of
// per-node timing events. The method is virtual so that a derived backend
// can override it.
virtual bool IsProfileEventTypeParentOfNodeTime(
    QnnProfile_EventType_t event_type) {
  (void)event_type; // intentionally unused in the base implementation
  return false;
}
3034

3135
Error Configure();
3236

backends/qualcomm/runtime/backends/QnnBackendFactory.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
6363

6464
backend_params->qnn_graph_ptr_ = std::make_unique<HtpGraph>(
6565
implementation,
66+
backend_params->qnn_backend_ptr_.get(),
6667
backend_params->qnn_context_ptr_.get(),
68+
options->profile_level(),
6769
options->graph_name()->str(),
6870
options->soc_info(),
6971
htp_options);

backends/qualcomm/runtime/backends/QnnGraphCommon.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,25 @@ Error QnnGraph::Configure() {
5151
return Error::Internal;
5252
}
5353

54+
// The profiler needs to be created after the backend is created.
55+
profile_ =
56+
std::make_unique<QnnProfile>(implementation_, backend_, profile_level_);
5457
return Error::Ok;
5558
}
5659

60+
/**
 * Executes the graph through the QNN interface.
 *
 * @param input_tensor_structs Input tensors handed to qnn_graph_execute.
 * @param output_tensor_structs Output tensors handed to qnn_graph_execute.
 * @return The error handle returned by qnn_graph_execute.
 */
Qnn_ErrorHandle_t QnnGraph::GraphExecute(
    const std::vector<Qnn_Tensor_t>& input_tensor_structs,
    std::vector<Qnn_Tensor_t>& output_tensor_structs) {
  // profile_ is only assigned at the end of Configure(). If the graph is
  // executed without it, pass a null profile handle (the value that was
  // passed here before the profiler was wired in) instead of dereferencing
  // a null pointer.
  return implementation_.GetQnnInterface().qnn_graph_execute(
      handle_,
      input_tensor_structs.data(),
      input_tensor_structs.size(),
      output_tensor_structs.data(),
      output_tensor_structs.size(),
      /*profile=*/profile_ ? profile_->GetHandle() : nullptr,
      /*signalHandle=*/nullptr);
}
72+
5773
Error QnnGraph::EnsureTensorInQnnGraph(
5874
const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
5975
const QnnInterface& qnn_interface = implementation_.GetQnnInterface();

backends/qualcomm/runtime/backends/QnnGraphCommon.h

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <executorch/backends/qualcomm/runtime/Logging.h>
1212
#include <executorch/backends/qualcomm/runtime/backends/QnnContextCommon.h>
1313
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
14+
#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
1415

1516
#include <vector>
1617

@@ -23,11 +24,15 @@ class QnnGraph {
2324
public:
2425
explicit QnnGraph(
2526
const QnnImplementation& implementation,
27+
QnnBackend* backend,
2628
QnnContext* context,
29+
const QnnExecuTorchProfileLevel& profile_level,
2730
const std::string& graph_name)
2831
: handle_(nullptr),
2932
implementation_(implementation),
33+
backend_(backend),
3034
context_(context),
35+
profile_level_(profile_level),
3136
graph_name_(graph_name) {}
3237

3338
virtual ~QnnGraph(){};
@@ -36,16 +41,7 @@ class QnnGraph {
3641

3742
Qnn_ErrorHandle_t GraphExecute(
3843
const std::vector<Qnn_Tensor_t>& input_tensor_structs,
39-
std::vector<Qnn_Tensor_t>& output_tensor_structs) {
40-
return implementation_.GetQnnInterface().qnn_graph_execute(
41-
handle_,
42-
input_tensor_structs.data(),
43-
input_tensor_structs.size(),
44-
output_tensor_structs.data(),
45-
output_tensor_structs.size(),
46-
/*profile=*/nullptr,
47-
/*signalHandle=*/nullptr);
48-
};
44+
std::vector<Qnn_Tensor_t>& output_tensor_structs);
4945

5046
Qnn_ErrorHandle_t GraphAddNode(const Qnn_OpConfig_t& op_config) {
5147
return implementation_.GetQnnInterface().qnn_graph_add_node(
@@ -58,7 +54,9 @@ class QnnGraph {
5854
return implementation_.GetQnnInterface().qnn_graph_finalize(
5955
handle_, nullptr /* profile_handle */, nullptr /* signal_handle */);
6056
};
61-
57+
// Forwards event_tracer to the profiler owned by this graph and returns the
// error handle produced by QnnProfile::ProfileData.
Qnn_ErrorHandle_t ProfileExecuteData(EventTracer* event_tracer) {
  const Qnn_ErrorHandle_t status = profile_->ProfileData(event_tracer);
  return status;
}
6260
Qnn_GraphHandle_t GetHandle() {
6361
return handle_;
6462
}
@@ -71,8 +69,11 @@ class QnnGraph {
7169
private:
7270
Qnn_GraphHandle_t handle_;
7371
const QnnImplementation& implementation_;
72+
QnnBackend* backend_;
7473
QnnContext* context_;
74+
QnnExecuTorchProfileLevel profile_level_;
7575
std::string graph_name_;
76+
std::unique_ptr<QnnProfile> profile_;
7677
};
7778
} // namespace qnn
7879
} // namespace executor
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* Copyright (c) Qualcomm Innovation Center, Inc.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/qualcomm/runtime/backends/QnnProfiler.h>
10+
#include <iostream>
11+
12+
namespace torch {
13+
namespace executor {
14+
namespace qnn {
15+
16+
/**
 * Creates a QNN profile handle for the given backend unless profiling is
 * switched off.
 *
 * A failure of qnn_profile_create is logged as a warning and otherwise
 * ignored; handle_ then stays null.
 *
 * @param implementation Provides access to the QNN interface.
 * @param backend Backend whose handle the profile handle is created for.
 * @param profile_level Requested level; kProfileOff skips handle creation.
 */
QnnProfile::QnnProfile(
    const QnnImplementation& implementation,
    QnnBackend* backend,
    const QnnExecuTorchProfileLevel& profile_level)
    : handle_(nullptr), implementation_(implementation), backend_(backend) {
  // Profiling disabled: keep the null handle set by the initializer list.
  if (profile_level == QnnExecuTorchProfileLevel::kProfileOff) {
    return;
  }

  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
  const Qnn_ErrorHandle_t create_status = qnn_interface.qnn_profile_create(
      backend_->GetHandle(), static_cast<int>(profile_level), &handle_);
  if (create_status == QNN_SUCCESS) {
    return;
  }

  QNN_EXECUTORCH_LOG_WARN(
      "Failed to create profile_handle for backend "
      " %u, error=%d",
      qnn_interface.GetBackendId(),
      QNN_GET_ERROR_CODE(create_status));

  // Creation failure is not fatal: continue without a profile handle.
  handle_ = nullptr;
}
37+
38+
/**
 * Reports the per-node profile events held by handle_ to the event tracer.
 *
 * Every top-level event of the profile handle is inspected. For events whose
 * type the backend reports as a parent of node time
 * (IsProfileEventTypeParentOfNodeTime), each sub event of type
 * QNN_PROFILE_EVENTTYPE_NODE measured in microseconds or cycles is logged
 * through event_tracer_log_profiling_delegate under the sub event's
 * identifier.
 *
 * @param event_tracer Tracer passed unchanged to
 *     event_tracer_log_profiling_delegate.
 * @return QNN_SUCCESS when all events were read, or when there is no profile
 *     handle to read from; otherwise the first QNN error encountered.
 */
Qnn_ErrorHandle_t QnnProfile::ProfileData(EventTracer* event_tracer) {
  // The constructor tolerates a failed qnn_profile_create and leaves handle_
  // null. Without a handle there is nothing to report, so succeed instead of
  // passing a null handle to the QNN API.
  if (handle_ == nullptr) {
    return QNN_SUCCESS;
  }

  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
  const QnnProfile_EventId_t* events_ptr = nullptr;
  std::uint32_t num_events = 0;
  Qnn_ErrorHandle_t error =
      qnn_interface.qnn_profile_get_events(handle_, &events_ptr, &num_events);
  if (error != QNN_SUCCESS) {
    QNN_EXECUTORCH_LOG_ERROR(
        "ProfileData failed to get events: %d", QNN_GET_ERROR_CODE(error));
    return error;
  }

  for (std::uint32_t i = 0; i < num_events; ++i) {
    QnnProfile_EventData_t event_data;
    error =
        qnn_interface.qnn_profile_get_event_data(events_ptr[i], &event_data);
    if (error != QNN_SUCCESS) {
      QNN_EXECUTORCH_LOG_ERROR(
          "ProfileData failed to get event data "
          "for event %u: %d",
          i,
          QNN_GET_ERROR_CODE(error));
      return error;
    }

    // Check an event's sub events only if it relates to graph execution time
    // (and its sub events are the individual op executions):
    if (!backend_->IsProfileEventTypeParentOfNodeTime(event_data.type)) {
      continue;
    }

    const QnnProfile_EventId_t* sub_events_ptr = nullptr;
    std::uint32_t num_sub_events = 0;
    error = qnn_interface.qnn_profile_get_sub_events(
        events_ptr[i], &sub_events_ptr, &num_sub_events);
    if (error != QNN_SUCCESS) {
      QNN_EXECUTORCH_LOG_ERROR(
          "ProfileData failed to get sub events "
          "for event %u: %d",
          i,
          QNN_GET_ERROR_CODE(error));
      return error;
    }

    for (std::uint32_t j = 0; j < num_sub_events; ++j) {
      QnnProfile_EventData_t sub_event_data;
      error = qnn_interface.qnn_profile_get_event_data(
          sub_events_ptr[j], &sub_event_data);
      if (error != QNN_SUCCESS) {
        QNN_EXECUTORCH_LOG_ERROR(
            "ProfileData failed to get sub "
            "event data for sub event %u of event %u: %d",
            j,
            i,
            QNN_GET_ERROR_CODE(error));
        return error;
      }

      // Only per-node events expressed in microseconds or cycles are
      // forwarded; every other sub event is skipped.
      const bool is_node_event =
          sub_event_data.type == QNN_PROFILE_EVENTTYPE_NODE;
      const bool has_supported_unit =
          sub_event_data.unit == QNN_PROFILE_EVENTUNIT_MICROSEC ||
          sub_event_data.unit == QNN_PROFILE_EVENTUNIT_CYCLES;
      if (is_node_event && has_supported_unit) {
        // NOTE(review): the two trailing arguments are presumably the start
        // and end timestamps, so the measured value is reported as a span
        // starting at 0 - confirm against the event tracer delegate hooks.
        torch::executor::event_tracer_log_profiling_delegate(
            event_tracer,
            sub_event_data.identifier,
            /*delegate_debug_id=*/
            static_cast<torch::executor::DebugHandle>(-1),
            0,
            sub_event_data.value);
      }
    }
  }
  return QNN_SUCCESS;
}
105+
106+
/**
 * Releases the QNN profile handle, if one was created.
 *
 * A failure of qnn_profile_free is logged as an error; handle_ is reset to
 * null either way.
 */
QnnProfile::~QnnProfile() {
  const QnnInterface& qnn_interface = implementation_.GetQnnInterface();
  // No handle was created (or it was already released): nothing to free.
  if (handle_ == nullptr) {
    return;
  }

  const Qnn_ErrorHandle_t free_status =
      qnn_interface.qnn_profile_free(handle_);
  if (free_status != QNN_SUCCESS) {
    QNN_EXECUTORCH_LOG_ERROR(
        "Failed to free QNN profile_handle. Backend "
        "ID %u, error %d",
        qnn_interface.GetBackendId(),
        QNN_GET_ERROR_CODE(free_status));
  }
  handle_ = nullptr;
}
120+
} // namespace qnn
121+
} // namespace executor
122+
} // namespace torch

0 commit comments

Comments
 (0)