
Commit 55c504d

Qualcomm AI Engine Direct - Enable zero copy feature
Summary:
- Add a "shared_buffer" argument to compiler_spec, qnn_executor_runner, and the test scripts. Ideally, shared_buffer would be a runtime option, since users are responsible for allocating tensor memory on the device; however, there is currently no way to pass a runtime option to QnnBackend, so it lives in compile_spec for now.
- Implement SharedBuffer to allocate and free RPC memory.
- Add QnnMemManager to register shared buffers for tensors. At execution time, tensor data memory is registered with QNN; it is deregistered when QnnBackend is destroyed.
- Add two APIs, `void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment)` and `void QnnExecuTorchFreeCustomMem(void* buffer_ptr)`, to allocate RPC memory through SharedBuffer. Users are responsible for allocating "enough" tensor bytes and for setting the alignment to MemoryAllocator::kDefaultAlignment; see runtime/core/memory_allocator.h.
1 parent 22cd0f8 commit 55c504d

31 files changed: +743 -56 lines
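For context, here is a minimal sketch of how a client such as qnn_executor_runner might exercise the new zero-copy path. Only QnnExecuTorchAllocCustomMem, QnnExecuTorchFreeCustomMem, and MemoryAllocator::kDefaultAlignment come from this commit; the buffer size and the surrounding module setup are hypothetical.

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/runtime/core/memory_allocator.h>

// Hypothetical input size; users must allocate "enough" bytes for the tensor.
constexpr size_t kInputBytes = 1 * 3 * 224 * 224 * sizeof(float);

void RunWithSharedBuffer() {
  // Allocate RPC shared memory with the required default alignment.
  void* input_buf = QnnExecuTorchAllocCustomMem(
      kInputBytes, torch::executor::MemoryAllocator::kDefaultAlignment);
  if (input_buf == nullptr) {
    // Allocation failed; the backend would fall back to copying tensor data.
    return;
  }

  // ... fill input_buf, wrap it as the module's input tensor, and execute;
  // QnnBackend registers this memory with QNN instead of copying it ...

  QnnExecuTorchFreeCustomMem(input_buf);
}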

backends/qualcomm/CMakeLists.txt

Lines changed: 27 additions & 12 deletions
@@ -118,26 +118,28 @@ include_directories(
 #
 # declare targets
 #
+add_library(executorch_backend INTERFACE)
 add_library(qcir INTERFACE qcir_schema_output)
 add_library(qcir_utils STATIC)
-add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
-add_library(executorch_backend INTERFACE)
+add_library(qnn_backend STATIC)
+add_library(qnn_backend_cache STATIC)
+add_library(qnn_context STATIC)
+add_library(qnn_device STATIC)
 add_library(qnn_executorch_backend SHARED)
 add_library(qnn_executorch_header INTERFACE)
 add_library(qnn_executorch_logging STATIC)
-add_library(qnn_manager STATIC)
+add_library(qnn_factory STATIC)
 add_library(qnn_function_interface INTERFACE)
+add_library(qnn_graph STATIC)
+add_library(qnn_header INTERFACE)
 add_library(qnn_implementation STATIC)
+add_library(qnn_logger STATIC)
+add_library(qnn_manager STATIC)
+add_library(qnn_mem_manager STATIC)
+add_library(qnn_schema INTERFACE ${_qnn_schema__outputs})
 add_library(qnn_sys_function_interface INTERFACE)
 add_library(qnn_sys_implementation STATIC)
-add_library(qnn_logger STATIC)
-add_library(qnn_device STATIC)
-add_library(qnn_context STATIC)
-add_library(qnn_backend_cache STATIC)
-add_library(qnn_graph STATIC)
-add_library(qnn_backend STATIC)
-add_library(qnn_factory STATIC)
-add_library(qnn_header INTERFACE)
+add_library(shared_buffer STATIC)
 add_library(wrappers STATIC)
 add_library(utils STATIC)
 

@@ -214,6 +216,13 @@ target_link_libraries(qnn_graph
   qnn_implementation
   qnn_context
 )
+target_link_libraries(qnn_mem_manager
+  PRIVATE
+  qnn_executorch_logging
+  qnn_implementation
+  qnn_context
+)
+
 target_link_libraries(qnn_factory
   PUBLIC
   qnn_header

@@ -223,13 +232,15 @@ target_link_libraries(qnn_factory
   qnn_device
   qnn_context
   qnn_graph
+  qnn_mem_manager
 )
 target_link_libraries(qnn_manager
   PRIVATE
   qnn_factory
   wrappers
   qnn_schema
   utils
+  shared_buffer
 )
 target_link_libraries(qnn_executorch_backend
   PRIVATE

@@ -243,7 +254,11 @@ target_link_libraries(utils
   PRIVATE
   qnn_executorch_logging
 )
-
+target_link_libraries(shared_buffer
+  PRIVATE
+  qnn_executorch_logging
+  ${CMAKE_DL_LIBS}
+)
 #
 # add linker option
 #

backends/qualcomm/aot/wrappers/TensorWrapper.cpp

Lines changed: 7 additions & 0 deletions
@@ -105,6 +105,7 @@ TensorWrapper::TensorWrapper(
 
 Error TensorWrapper::FillDataBuffer(const void* data, bool copy_data) {
   if (data != nullptr) {
+    QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_RAW;
     QNN_VER_PTR(tensor_)->clientBuf.dataSize = bytes_;
     if (copy_data) {
       owned_data_ = std::make_unique<char[]>(bytes_);

@@ -144,6 +145,12 @@ Error TensorWrapper::SetName(const std::string& name) {
   return Error::Ok;
 }
 
+Error TensorWrapper::SetMemHandle(Qnn_MemHandle_t mem_handle) {
+  QNN_VER_PTR(tensor_)->memType = QNN_TENSORMEMTYPE_MEMHANDLE;
+  QNN_VER_PTR(tensor_)->memHandle = mem_handle;
+  return Error::Ok;
+}
+
 // base function for Create TensorWrapper
 std::shared_ptr<TensorWrapper> CreateTensorWrapper(
     const std::string& tensor_name,

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 24 additions & 2 deletions
@@ -59,16 +59,38 @@ class TensorWrapper {
     return QNN_VER_PTR(tensor_)->type == QNN_TENSOR_TYPE_STATIC;
   };
 
-  const void* GetStaticTensorData() const {
-    return QNN_VER_PTR(tensor_)->clientBuf.data;
+  std::uint32_t* GetDims() const {
+    return QNN_VER_PTR(tensor_)->dimensions;
+  };
+
+  Qnn_DataType_t GetDataType() const {
+    return QNN_VER_PTR(tensor_)->dataType;
+  };
+
+  Qnn_MemHandle_t const GetMemHandle() {
+    return QNN_VER_PTR(tensor_)->memHandle;
+  };
+
+  Qnn_TensorMemType_t GetMemType() const {
+    return QNN_VER_PTR(tensor_)->memType;
   };
 
   std::string GetName() const {
     return qnn_tensor_name_;
   };
 
+  std::uint32_t GetRank() const {
+    return QNN_VER_PTR(tensor_)->rank;
+  };
+
+  const void* GetStaticTensorData() const {
+    return QNN_VER_PTR(tensor_)->clientBuf.data;
+  };
+
   Error SetName(const std::string& name);
 
+  Error SetMemHandle(Qnn_MemHandle_t mem_handle);
+
  private:
  // need this to handle QNN_TENSOR_ERROR_NAME_HASH_COLLISION
  std::string qnn_tensor_name_;
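The new accessors expose enough metadata to reason about a wrapped tensor without touching the underlying Qnn_Tensor_t directly. As an illustration, the element count of a tensor can be derived from GetRank() and GetDims(); the helper below is hypothetical, not part of the commit, and assumes TensorWrapper.h is in scope.

#include <cstddef>
#include <cstdint>

// Hypothetical helper built on the accessors added above.
size_t NumElements(const TensorWrapper& wrapper) {
  size_t count = 1;
  for (std::uint32_t i = 0; i < wrapper.GetRank(); ++i) {
    count *= wrapper.GetDims()[i];
  }
  return count;
}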

backends/qualcomm/runtime/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
@@ -47,3 +47,10 @@ target_sources(utils
   PRIVATE
   ${CMAKE_CURRENT_LIST_DIR}/Utils.cpp
 )
+
+# shared_buffer
+target_sources(shared_buffer
+  PRIVATE
+  ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.h
+  ${CMAKE_CURRENT_LIST_DIR}/SharedBuffer.cpp
+)

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 10 additions & 0 deletions
@@ -31,6 +31,16 @@ typedef struct {
 }
 // clang-format on
 
+/// Allocate specific tensors (usually graph inputs and outputs) on shared
+/// memory. Users are responsible to allocate "enough" tensor bytes, and set
+/// alignment as MemoryAllocator::kDefaultAlignment.
+/// See runtime/core/memory_allocator.h. The function returns a valid pointer
+/// if allocation is successful.
+void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
+
+/// Free the allocated shared memory.
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
+
 #ifdef __cplusplus
 }
 #endif // __cplusplus

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 13 additions & 5 deletions
@@ -188,19 +188,27 @@ Error QnnExecuTorchBackend::execute(
   std::vector<Qnn_Tensor_t> input_tensor_structs;
   std::vector<Qnn_Tensor_t> output_tensor_structs;
 
+  input_tensor_structs.reserve(input_tensors.size());
   for (int i = 0; i < input_tensors.size(); ++i) {
-    input_tensors[i]->FillDataBuffer(
-        args[i]->toTensor().const_data_ptr(), true /* copy_data */);
+    if (qnn_manager->RegisterMem(
+            args[i]->toTensor().mutable_data_ptr(), input_tensors[i]) !=
+        Error::Ok) {
+      input_tensors[i]->FillDataBuffer(
+          args[i]->toTensor().const_data_ptr(), true /* copy_data */);
+    }
     input_tensor_structs.push_back(input_tensors[i]->CloneTensorStruct());
   }
 
   int output_index = input_tensors.size();
   for (const auto& output_tensor : output_tensors) {
     // pos=0 limits the search to the prefix
     if (output_tensor->GetName().rfind("output_", 0) == 0) {
-      output_tensor->FillDataBuffer(
-          args[output_index]->toTensor().mutable_data_ptr(),
-          false /* copy_data */);
+      void* mutable_data_ptr =
+          args[output_index]->toTensor().mutable_data_ptr();
+      if (qnn_manager->RegisterMem(mutable_data_ptr, output_tensor) !=
+          Error::Ok) {
+        output_tensor->FillDataBuffer(mutable_data_ptr, false /* copy_data */);
+      }
       output_index++;
     }
     output_tensor_structs.push_back(output_tensor->CloneTensorStruct());
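Both loops follow the same register-or-copy pattern: try zero-copy registration first, then fall back to the pre-existing copy path so models keep working when shared buffers are disabled or the pointer was not allocated through QnnExecuTorchAllocCustomMem. Schematically (names as in the diff above):

// Schematic of the fallback used for both inputs and outputs.
if (qnn_manager->RegisterMem(data_ptr, tensor) != Error::Ok) {
  // RegisterMem returns Error::Internal when shared buffers are disabled,
  // unsupported by the backend, or data_ptr is not RPC memory.
  tensor->FillDataBuffer(data_ptr, copy_data);
}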

backends/qualcomm/runtime/QnnManager.cpp

Lines changed: 62 additions & 10 deletions
@@ -6,9 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 #include <executorch/backends/qualcomm/runtime/QnnManager.h>
+#include <executorch/backends/qualcomm/runtime/SharedBuffer.h>
 #include <executorch/backends/qualcomm/runtime/Utils.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
-
 #include <cstdlib>
 #include <cstring>
 #include <fstream>

@@ -51,7 +51,9 @@ QnnManager::QnnManager(
         "the size of qnn context binary: %d",
         qnn_executorch_context_binary.nbytes);
     QNN_EXECUTORCH_LOG_INFO(
-        "Is on-device graph construction: %d", options_->online_prepare());
+        "Is on-device graph construction: %d", options->online_prepare());
+    QNN_EXECUTORCH_LOG_INFO(
+        "Enable shared buffer: %d", options->shared_buffer());
   }
 
   if (library_path.empty()) {

@@ -79,6 +81,53 @@ Error QnnManager::LoadQnnLibrary() {
   return ret;
 }
 
+Error QnnManager::RegisterMem(
+    void* data_ptr,
+    const std::shared_ptr<TensorWrapper>& tensor_wrapper) {
+  SharedBuffer& shared_buffer_manager = SharedBuffer::GetSharedBufferManager();
+  // Not enable shared buffer
+  if (!options_->shared_buffer())
+    return Error::Internal;
+
+  if (backend_params_ptr_->qnn_mem_manager_ptr_ == nullptr) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Backend %s doesn't supported shared buffer.",
+        EnumNameQnnExecuTorchBackendType(
+            options_->backend_options()->backend_type()));
+    return Error::Internal;
+  }
+
+  if (!shared_buffer_manager.IsAllocated(data_ptr)) {
+    // It means two scenarios here:
+    // 1. the input and output partitioned graph
+    // 2. Actually, user doesn't allocate shared buffer with
+    //    QnnExecuTorchAllocCustomMem API
+    return Error::Internal;
+  } else if (backend_params_ptr_->qnn_mem_manager_ptr_->IsRegistered(
+                 tensor_wrapper->GetMemHandle())) {
+    if (options_->log_level() >= QnnExecuTorchLogLevel::kLogLevelInfo)
+      QNN_EXECUTORCH_LOG_INFO(
+          "Tensor name %s has been registered shared memory.",
+          tensor_wrapper->GetName().c_str());
+    return Error::Ok;
+  }
+
+  int32_t mem_fd = SharedBuffer::GetSharedBufferManager().MemToFd(data_ptr);
+  if (mem_fd == -1) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Tensor name %s is failed to get file descriptor.",
+        tensor_wrapper->GetName().c_str());
+    return Error::Internal;
+  }
+  ET_CHECK_OR_RETURN_ERROR(
+      backend_params_ptr_->qnn_mem_manager_ptr_->RegisterMem(
+          tensor_wrapper, mem_fd) == Error::Ok,
+      Internal,
+      "Fail to register to shared memory.");
+
+  return Error::Ok;
+}
+
 Error QnnManager::Init() {
   ET_CHECK_OR_RETURN_ERROR(
       LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library");

@@ -202,14 +251,6 @@ void QnnManager::Destroy() {
   qnn_loaded_backend_.TerminateAllBackends();
 }
 
-bool QnnManager::IsAvailable() {
-  return true;
-}
-
-bool QnnManager::IsOnlinePrepare() {
-  return options_->online_prepare();
-}
-
 bool QnnManager::IsNodeSupportedByBackend(
     std::vector<std::shared_ptr<OpWrapper>>& op_wrappers) {
   Qnn_ErrorHandle_t error = QNN_SUCCESS;

@@ -312,3 +353,14 @@ Error QnnManager::Compile(
 } // namespace qnn
 } // namespace executor
 } // namespace torch
+void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
+  using torch::executor::qnn::SharedBuffer;
+  void* buffer_ptr =
+      SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment);
+  return buffer_ptr;
+}
+
+void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
+  using torch::executor::qnn::SharedBuffer;
+  SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr);
+}
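SharedBuffer.h and SharedBuffer.cpp are not shown in this excerpt. From the call sites above (GetSharedBufferManager, AllocMem, FreeMem, IsAllocated, MemToFd), its public interface is roughly the following; this is a reconstruction inferred from usage, not the actual header.

#include <cstddef>
#include <cstdint>

namespace torch {
namespace executor {
namespace qnn {

// Interface inferred from call sites; the real SharedBuffer.h may differ.
class SharedBuffer final {
 public:
  // Process-wide singleton manager for RPC shared memory.
  static SharedBuffer& GetSharedBufferManager();

  // Allocate and free RPC shared memory.
  void* AllocMem(size_t bytes, size_t alignment);
  void FreeMem(void* buffer_ptr);

  // True if buffer_ptr was handed out by AllocMem.
  bool IsAllocated(void* buffer_ptr);

  // Map an allocated buffer to the file descriptor QNN registration needs;
  // returns -1 on failure.
  int32_t MemToFd(void* data_ptr);
};

} // namespace qnn
} // namespace executor
} // namespace torch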

backends/qualcomm/runtime/QnnManager.h

Lines changed: 11 additions & 3 deletions
@@ -40,21 +40,29 @@ class QnnManager {
 
   void Destroy();
 
-  bool IsAvailable();
+  bool IsAvailable() {
+    return true;
+  }
+
+  bool IsOnlinePrepare() {
+    return options_->online_prepare();
+  }
 
   bool IsTensorDump() {
     return options_->tensor_dump_output_path()->size() > 0;
   }
 
-  bool IsOnlinePrepare();
-
   bool IsNodeSupportedByBackend(
       std::vector<std::shared_ptr<OpWrapper>>& op_wrappers);
 
   Error Compile(
       std::vector<std::shared_ptr<OpWrapper>>& op_wrappers,
       QnnExecuTorchContextBinary& qnn_executorch_context_binary);
 
+  Error RegisterMem(
+      void* data_ptr,
+      const std::shared_ptr<TensorWrapper>& tensor_wrapper);
+
   std::vector<std::shared_ptr<TensorWrapper>> GetGraphInputs() {
     return input_tensors_;
   }
