
Commit a531ca5

shewu-quic authored and facebook-github-bot committed
Qualcomm AI Engine Direct - Enable zero copy feature (#2531)
Summary:
- Add a "shared_buffer" argument to compiler_spec, qnn_executor_runner, and the test scripts. Strictly speaking, shared_buffer should be a runtime option, since users are responsible for allocating memory for tensors on device; however, there is currently no way to pass runtime options to QnnBackend, so it lives in compile_spec for now.
- Implement SharedBuffer to allocate and free RPC memory.
- Add QnnMemManager to register shared buffers for tensors.
- At execution time, register the memory backing tensor data with QNN; deregister it when QnnBackend is destroyed.
- Add two APIs, `void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment)` and `void QnnExecuTorchFreeCustomMem(void* buffer_ptr)`, to allocate RPC memory through SharedBuffer. Users are responsible for allocating "enough" tensor bytes and for setting the alignment to MemoryAllocator::kDefaultAlignment; see runtime/core/memory_allocator.h.

Pull Request resolved: #2531

Reviewed By: kirklandsign

Differential Revision: D55142607

Pulled By: cccclai

fbshipit-source-id: 6a394ef0023e70362c3cf963a0519c3efaa95bc2
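As a usage illustration (not part of this commit's diff): a minimal sketch of driving the zero-copy API from application code. The tensor sizes, the RunWithSharedBuffers function, and the elided execution step are hypothetical; only QnnExecuTorchAllocCustomMem, QnnExecuTorchFreeCustomMem, and torch::executor::MemoryAllocator::kDefaultAlignment come from this change and runtime/core/memory_allocator.h.

#include <cstddef>

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/runtime/core/memory_allocator.h>

// Hypothetical sizes for a graph with one 1x3x224x224 float input and one
// 1x1000 float output; real sizes come from the model's method metadata.
constexpr size_t kInputBytes = 1 * 3 * 224 * 224 * sizeof(float);
constexpr size_t kOutputBytes = 1 * 1000 * sizeof(float);

void RunWithSharedBuffers() {
  const size_t alignment = torch::executor::MemoryAllocator::kDefaultAlignment;
  // Allocate graph I/O on RPC shared memory so the backend can register it
  // with QNN instead of copying at execute() time.
  void* input = QnnExecuTorchAllocCustomMem(kInputBytes, alignment);
  void* output = QnnExecuTorchAllocCustomMem(kOutputBytes, alignment);
  if (input == nullptr || output == nullptr) {
    // Allocation may fail, e.g. when RPC memory is unavailable on the device.
    if (input != nullptr) QnnExecuTorchFreeCustomMem(input);
    if (output != nullptr) QnnExecuTorchFreeCustomMem(output);
    return;
  }

  // Fill `input`, wrap both pointers in the tensors handed to the method's
  // execute() call, then read results from `output`. Pointers that were not
  // allocated through QnnExecuTorchAllocCustomMem make RegisterMem fail, and
  // the backend falls back to the copying path.

  QnnExecuTorchFreeCustomMem(input);
  QnnExecuTorchFreeCustomMem(output);
}

Note that shared_buffer must also be enabled in the compile spec at export time; otherwise RegisterMem returns an error and the runtime copies tensor data as before.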
1 parent 9e922d3 commit a531ca5

33 files changed: +751 −71 lines

backends/qualcomm/CMakeLists.txt

Lines changed: 28 additions & 13 deletions
@@ -118,27 +118,29 @@ include_directories(
 #
 # declare targets
 #
+add_library(executorch_backend INTERFACE)
 add_library(qcir INTERFACE qcir_schema_output)
 add_library(qcir_utils STATIC)
-add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
-add_library(executorch_backend INTERFACE)
+add_library(qnn_backend STATIC)
+add_library(qnn_backend_cache STATIC)
+add_library(qnn_context STATIC)
+add_library(qnn_device STATIC)
 add_library(qnn_executorch_backend SHARED)
 add_library(qnn_executorch_header INTERFACE)
 add_library(qnn_executorch_logging STATIC)
-add_library(qnn_manager STATIC)
+add_library(qnn_factory STATIC)
 add_library(qnn_function_interface INTERFACE)
+add_library(qnn_graph STATIC)
+add_library(qnn_header INTERFACE)
 add_library(qnn_implementation STATIC)
-add_library(qnn_sys_function_interface INTERFACE)
-add_library(qnn_sys_implementation STATIC)
 add_library(qnn_logger STATIC)
+add_library(qnn_manager STATIC)
+add_library(qnn_mem_manager STATIC)
 add_library(qnn_profiler STATIC)
-add_library(qnn_device STATIC)
-add_library(qnn_context STATIC)
-add_library(qnn_backend_cache STATIC)
-add_library(qnn_graph STATIC)
-add_library(qnn_backend STATIC)
-add_library(qnn_factory STATIC)
-add_library(qnn_header INTERFACE)
+add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
+add_library(qnn_sys_function_interface INTERFACE)
+add_library(qnn_sys_implementation STATIC)
+add_library(shared_buffer STATIC)
 add_library(wrappers STATIC)
 add_library(utils STATIC)
 
@@ -220,6 +222,13 @@ target_link_libraries(qnn_graph
     qnn_context
     qnn_profiler
 )
+target_link_libraries(qnn_mem_manager
+  PRIVATE
+    qnn_executorch_logging
+    qnn_implementation
+    qnn_context
+)
+
 target_link_libraries(qnn_factory
   PUBLIC
     qnn_header
@@ -229,13 +238,15 @@ target_link_libraries(qnn_factory
     qnn_device
     qnn_context
     qnn_graph
+    qnn_mem_manager
 )
 target_link_libraries(qnn_manager
   PRIVATE
     qnn_factory
     wrappers
     qnn_schema
    utils
+    shared_buffer
 )
 target_link_libraries(qnn_executorch_backend
   PRIVATE
@@ -249,7 +260,11 @@ target_link_libraries(utils
   PRIVATE
     qnn_executorch_logging
 )
-
+target_link_libraries(shared_buffer
+  PRIVATE
+    qnn_executorch_logging
+    ${CMAKE_DL_LIBS}
+)
 #
 # add linker option
 #

backends/qualcomm/aot/wrappers/TensorWrapper.cpp

Lines changed: 7 additions & 0 deletions
@@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper(
 
 Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
   if (data != nullptr) {
+    QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
     QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
     if (copy_data) {
       owned_data_ = std::make_unique<char[]>(bytes_);
@@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) {
   return Error::Ok;
 }
 
+Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) {
+  QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
+  QNN_VER_PTR(tensor_)->memHandle = mem_handle;
+  return Error::Ok;
+}
+
 // base function for Create TensorWrapper
 std::shared_ptr<TensorWrapper> CreateTensorWrapper(
     const std::string& tensor_name,

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 24 additions & 2 deletions
@@ -59,16 +59,38 @@ class TensorWrapper {
     return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC;
   };
 
-  const void* GetStaticTensorData() const {
-    return QNN_VER_PTR(tensor_)->clientBuf.data;
+  std::uint32_t* GetDims() const {
+    return QNN_VER_PTR(tensor_)->dimensions;
+  };
+
+  Qnn_DataType_t GetDataType() const {
+    return QNN_VER_PTR(tensor_)->dataType;
+  };
+
+  Qnn_MemHandle_t const GetMemHandle() {
+    return QNN_VER_PTR(tensor_)->memHandle;
+  };
+
+  Qnn_TensorMemType_t GetMemType() const {
+    return QNN_VER_PTR(tensor_)->memType;
   };
 
   std::string GetName() const {
     return qnn_tensor_name_;
   };
 
+  std::uint32_t GetRank() const {
+    return QNN_VER_PTR(tensor_)->rank;
+  };
+
+  const void* GetStaticTensorData() const {
+    return QNN_VER_PTR(tensor_)->clientBuf.data;
+  };
+
   Error SetName(const std::string& name);
 
+  Error SetMemHandle(Qnn_MemHandle_t mem_handle);
+
  private:
  // need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION
  std::string qnn_tensor_name_;

backends/qualcomm/passes/insert_io_qdq.py

Lines changed: 6 additions & 0 deletions
@@ -38,6 +38,12 @@ def _ceate_args(self, target: torch.fx.node.Target, quant_attrs: Dict):
         arg_schemas = list(target._schema.arguments)[1:]
         for arg_schema in arg_schemas:
             name = arg_schema.name
+            # TODO: Due to the new parameter "out_dtype" in the dequantize node,
+            # it could not be found in the quant_attrs of other nodes,
+            # and it will cause a key error. For now, the output type
+            # of our dequantize node is only float. (by default in pytorch)
+            if name == "out_dtype":
+                continue
             value = quant_attrs[name]
             if type(arg_schema.type) == torch.tensor and type(value) in [int, float]:
                 value = torch.tensor(value)

backends/qualcomm/runtime/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -47,3 +47,10 @@ target_sources(utils
   PRIVATE
     ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp
 )
+
+# shared_buffer
+target_sources(shared_buffer
+  PRIVATE
+    ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h
+    ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp
+)

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 12 additions & 0 deletions
@@ -8,8 +8,10 @@
 #pragma once
 
 #ifdef __cplusplus
+#include <cstddef>
 #include <cstdint>
 #else
+#include <stddef.h>
 #include <stdint.h>
 #endif
 
@@ -31,6 +33,16 @@ typedef struct {
 }
 // clang-format on
 
+/// Allocate specific tensors (usually graph inputs and outputs) on shared
+/// memory. Users are responsible for allocating "enough" tensor bytes and
+/// for setting the alignment to MemoryAllocator::kDefaultAlignment
+/// (see runtime/core/memory_allocator.h). The function returns a valid
+/// pointer if the allocation succeeds.
+void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
+
+/// Free the allocated shared memory.
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
+
 #ifdef __cplusplus
 }
 #endif // __cplusplus

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 13 additions & 5 deletions
@@ -188,19 +188,27 @@ Error QnnExecuTorchBackend::execute(
   std::vector<Qnn_Tensor_t> input_tensor_structs;
   std::vector<Qnn_Tensor_t> output_tensor_structs;
 
+  input_tensor_structs.reserve(input_tensors.size());
   for (int i = 0; i < input_tensors.size(); ++i) {
-    input_tensors[i]->FillDataBuffer(
-        args[i]->toTensor().const_data_ptr(), true /* copy_data */);
+    if (qnn_manager->RegisterMem(
+            args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) !=
+        Error::Ok) {
+      input_tensors[i]->FillDataBuffer(
+          args[i]->toTensor().const_data_ptr(), true /* copy_data */);
+    }
     input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
   }
 
   int output_index = input_tensors.size();
   for (const auto& output_tensor : output_tensors) {
     // pos=0 limits the search to the prefix
     if (output_tensor->GetName().rfind("output_", 0) == 0) {
-      output_tensor->FillDataBuffer(
-          args[output_index]->toTensor().mutable_data_ptr(),
-          false /* copy_data */);
+      void* mutable_data_ptr =
+          args[output_index]->toTensor().mutable_data_ptr();
+      if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) !=
+          Error::Ok) {
+        output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */);
+      }
       output_index++;
     }
     output_tensor_structs.push_back(output_tensor->CloneTensorStruct());

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 62 additions & 10 deletions
@@ -6,9 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
+#include <executorch/backends/qualcomm/runtime/SharedBuffer.h>
 #include <executorch/backends/qualcomm/runtime/Utils.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
-
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
@@ -54,7 +54,9 @@ QnnManager::QnnManager(
         "the size of qnn context binary: %d",
         qnn_executorch_context_binary.nbytes);
     QNN_EXECUTORCH_LOG_INFO(
-        "Is on-device graph construction: %d", options_->online_prepare());
+        "Is on-device graph construction: %d", options->online_prepare());
+    QNN_EXECUTORCH_LOG_INFO(
+        "Enable shared buffer: %d", options->shared_buffer());
   }
 
   if (library_path.empty()) {
@@ -82,6 +84,53 @@ Error QnnManager::LoadQnnLibrary() {
   return ret;
 }
 
+Error QnnManager::RegisterMem(
+    void* data_ptr,
+    const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
+  SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
+  // Shared buffer is not enabled
+  if (!options_->shared_buffer())
+    return Error::Internal;
+
+  if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Backend %s doesn't supported shared buffer.",
+        EnumNameQnnExecuTorchBackendType(
+            options_->backend_options()->backend_type()));
+    return Error::Internal;
+  }
+
+  if (!shared_buffer_manager.IsAllocated(data_ptr)) {
+    // Two scenarios can reach here:
+    // 1. the tensor is an input/output between partitioned graphs
+    // 2. the user did not allocate the buffer with the
+    //    QnnExecuTorchAllocCustomMem API
+    return Error::Internal;
+  } else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered(
+                 tensor_wrapper->GetMemHandle())) {
+    if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo)
+      QNN_EXECUTORCH_LOG_INFO(
+          "Tensor name %s has been registered shared memory.",
+          tensor_wrapper->GetName().c_str());
+    return Error::Ok;
+  }
+
+  int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr);
+  if (mem_fd == -1) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Tensor name %s is failed to get file descriptor.",
+        tensor_wrapper->GetName().c_str());
+    return Error::Internal;
+  }
+  ET_CHECK_OR_RETURN_ERROR(
+      backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem(
+          tensor_wrapper, mem_fd) == Error::Ok,
+      Internal,
+      "Fail to register to shared memory.");
+
+  return Error::Ok;
+}
+
 Error QnnManager::Init() {
   ET_CHECK_OR_RETURN_ERROR(
       LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library");
@@ -219,14 +268,6 @@ void QnnManager::Destroy() {
   qnn_loaded_backend_.TerminateAllBackends();
 }
 
-bool QnnManager::IsAvailable() {
-  return true;
-}
-
-bool QnnManager::IsOnlinePrepare() {
-  return options_->online_prepare();
-}
-
 bool QnnManager::IsNodeSupportedByBackend(
     std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
   Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -329,3 +370,14 @@ Error QnnManager::Compile(
 } // namespace qnn
 } // namespace executor
 } // namespace torch
+void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
+  using torch::executor::qnn::SharedBuffer;
+  void* buffer_ptr =
+      SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment);
+  return buffer_ptr;
+}
+
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
+  using torch::executor::qnn::SharedBuffer;
+  SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr);
+}

backends/qualcomm/runtime/QnnManager.h

Lines changed: 11 additions & 3 deletions
@@ -42,21 +42,29 @@ class QnnManager {
 
   void Destroy();
 
-  bool IsAvailable();
+  bool IsAvailable() {
+    return true;
+  }
+
+  bool IsOnlinePrepare() {
+    return options_->online_prepare();
+  }
 
   bool IsTensorDump() {
     return options_->tensor_dump_output_path()->size() > 0;
   }
 
-  bool IsOnlinePrepare();
-
   bool IsNodeSupportedByBackend(
       std::vector<std::shared_ptr<OpWrapper>>& op_wrappers);
 
   Error Compile(
      std::vector<std::shared_ptr<OpWrapper>>& op_wrappers,
      QnnExecuTorchContextBinary& qnn_executorch_context_binary);
 
+  Error RegisterMem(
+      void* data_ptr,
+      const std::shared_ptr<TensorWrapper>& tensor_wrapper);
+
   std::vector<std::shared_ptr<TensorWrapper>> GetGraphInputs() {
     return input_tensors_;
   }
