Skip to content

Commit c2adfa9

Browse files
committed
Remove flatbuffer64 and define our own protocol
1 parent c71fe08 commit c2adfa9

20 files changed

+489
-263
lines changed

backends/qualcomm/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ include_directories(
7676

7777
set(_qnn_schema__srcs
7878
backends/qualcomm/serialization/qc_compiler_spec.fbs
79-
backends/qualcomm/serialization/qc_binary_info.fbs
8079
)
8180
set(_qnn_schema__include_dir "${CMAKE_BINARY_DIR}/schema/include")
8281
# Paths to headers generated from the .fbs files.
@@ -116,6 +115,7 @@ add_library(qcir_utils STATIC)
116115
add_library(qnn_backend STATIC)
117116
add_library(qnn_backend_cache STATIC)
118117
add_library(qnn_context STATIC)
118+
add_library(qnn_custom_protocol STATIC)
119119
add_library(qnn_device STATIC)
120120
add_library(qnn_executorch_backend SHARED)
121121
add_library(qnn_executorch_header INTERFACE)
@@ -155,6 +155,7 @@ target_link_libraries(qnn_executorch_logging PRIVATE qnn_schema)
155155
target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging)
156156
target_link_libraries(qnn_logger PRIVATE qnn_implementation ${android_log})
157157
target_link_libraries(qnn_backend PRIVATE qnn_implementation qnn_logger)
158+
target_link_libraries(qnn_custom_protocol PRIVATE qcir_utils)
158159
target_link_libraries(
159160
qnn_device PRIVATE qnn_executorch_logging qnn_implementation qnn_logger
160161
)
@@ -177,7 +178,7 @@ target_link_libraries(
177178
qnn_factory
178179
PUBLIC qnn_header
179180
PRIVATE qnn_schema qnn_backend qnn_device qnn_context qnn_graph
180-
qnn_mem_manager
181+
qnn_mem_manager qnn_custom_protocol
181182
)
182183
target_link_libraries(
183184
qnn_manager PRIVATE qnn_factory wrappers qnn_schema utils shared_buffer

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 73 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
#pragma once
99
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
1010
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
11-
#include <executorch/backends/qualcomm/qc_binary_info_generated.h>
1211
#include <executorch/backends/qualcomm/qc_compiler_spec_generated.h>
1312
#include <executorch/backends/qualcomm/runtime/Logging.h>
1413
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
1514
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
15+
#include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
1616
#include <pybind11/numpy.h>
1717
#include <pybind11/pybind11.h>
1818
#include <pybind11/stl.h>
@@ -50,46 +50,64 @@ class PyQnnManager {
5050
qnn_executorch_options, qnn_executorch_context_binary_);
5151
}
5252

53-
// used for loading multiple graphs in qcir
53+
// used during stage 2 of multi-graph mode
5454
explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs)
5555
: qnn_executorch_option_ptr_(buffer) {
5656
auto qnn_executorch_options = GetQnnExecuTorchOptions(
5757
qnn_executorch_option_ptr_.cast<std::string_view>().data());
5858

5959
// merge multiple qcirs into one context with multiple graphs
6060

61-
// this makes it easier to do subtraction for offsets
61+
// We start retrieving tensors from offset = 0.
6262
std::vector<uint32_t> offsets(1, 0);
63-
std::vector<const flatbuffers::Vector64<uint8_t>*> tensor_data;
64-
fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE;
63+
std::vector<uint8_t> tensor_data;
64+
std::vector<uint8_t*> tensor_ptr;
65+
std::vector<uint64_t> tensor_size;
66+
uint64_t total_tensor_size = 0;
6567
for (size_t i = 0; i < qcirs.size(); ++i) {
6668
py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
67-
flatbuffers::Verifier verifier_binary_info(
68-
static_cast<const uint8_t* const>(info.ptr),
69-
info.size * info.itemsize,
70-
fb_opt_);
71-
if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) {
72-
QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info");
73-
return;
74-
}
75-
auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);
76-
tensor_data.push_back(binary_info->tensor_data());
77-
78-
flatbuffers::Verifier verifier_qcir(
79-
binary_info->context_data()->Data(),
80-
binary_info->context_data()->size());
81-
if (!qcir::VerifyContextBuffer(verifier_qcir)) {
82-
QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format");
69+
70+
uint8_t* qcir_custom_buffer_ptr = static_cast<uint8_t*>(info.ptr);
71+
QnnQcirCustomProtocol qnn_qcir_custom_protocol;
72+
auto [status, _, qcir_tensor_size, __, qcir_tensor_ptr] =
73+
qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer(
74+
qcir_custom_buffer_ptr);
75+
76+
if (status != Error::Ok) {
77+
QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol");
8378
return;
8479
}
85-
offsets.push_back(offsets.back() + binary_info->tensor_data()->size());
80+
81+
tensor_ptr.push_back(static_cast<uint8_t*>(qcir_tensor_ptr));
82+
tensor_size.push_back(qcir_tensor_size);
83+
total_tensor_size += qcir_tensor_size;
84+
offsets.push_back(offsets.back() + qcir_tensor_size);
85+
}
86+
87+
tensor_data.resize(total_tensor_size);
88+
89+
// store multiple graphs tensor in a contiguous memory space
90+
for (size_t i = 0; i < tensor_ptr.size(); ++i) {
91+
std::memcpy(
92+
tensor_data.data() + offsets[i], tensor_ptr[i], tensor_size[i]);
8693
}
8794

8895
std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
8996
for (size_t i = 0; i < qcirs.size(); ++i) {
9097
py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
91-
auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);
92-
auto context = qcir::GetContext(binary_info->context_data()->Data());
98+
99+
uint8_t* qcir_custom_buffer_ptr = static_cast<uint8_t*>(info.ptr);
100+
QnnQcirCustomProtocol qnn_qcir_custom_protocol;
101+
auto [status, qcir_fbs_size, _, qcir_fbs_ptr, __] =
102+
qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer(
103+
qcir_custom_buffer_ptr);
104+
105+
if (status != Error::Ok) {
106+
QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol");
107+
return;
108+
}
109+
110+
auto context = qcir::GetContext(qcir_fbs_ptr);
93111
for (const auto& graph : *context->graphs()) {
94112
std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
95113
for (const auto tensor : *graph->tensors()) {
@@ -138,7 +156,9 @@ class PyQnnManager {
138156
QnnExecuTorchContextBinary qcir_bin(
139157
{builder_.GetBufferPointer(), builder_.GetSize()});
140158

141-
qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin, tensor_data);
159+
// Init QnnQcirCustomProtocol binary
160+
qnn_executorch_context_binary_ =
161+
MakeQcirCustomBinaryInfo(qcir_bin, tensor_data);
142162
qnn_manager_ = std::make_shared<QnnManager>(
143163
qnn_executorch_options, qnn_executorch_context_binary_);
144164
}
@@ -152,7 +172,7 @@ class PyQnnManager {
152172
return qnn_manager_->IsNodeSupportedByBackend(op_wrappers);
153173
}
154174

155-
// this method is specific for compiling multi-graphs
175+
// this method is specific for stage 2 of compiling multi-graphs
156176
py::array_t<char> Compile() {
157177
if (qnn_manager_->CompileQcir() != Error::Ok) {
158178
QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir");
@@ -271,7 +291,13 @@ class PyQnnManager {
271291

272292
QnnExecuTorchContextBinary qcir_binary(
273293
{builder_.GetBufferPointer(), builder_.GetSize()});
274-
binary_info = MakeBinaryInfo(qcir_binary, tensor_data);
294+
295+
custom_qcir_protocol_buffer_ =
296+
QnnQcirCustomProtocol(qcir_binary.nbytes, tensor_data.size());
297+
custom_qcir_protocol_buffer_.BuildQcirCustomBuffer(
298+
qcir_binary, tensor_data);
299+
std::tie(binary_info.buffer, binary_info.nbytes) =
300+
custom_qcir_protocol_buffer_.GetCustomProtocolBuffer();
275301
} else {
276302
if (qnn_manager_->Compile(graph_name, op_wrappers) !=
277303
executorch::runtime::Error::Ok) {
@@ -338,101 +364,41 @@ class PyQnnManager {
338364
return qnn_manager_->GetSpillFillBufferSize();
339365
}
340366

367+
QnnExecuTorchContextBinary MakeQcirCustomBinaryInfo(
368+
const QnnExecuTorchContextBinary& ctx_bin,
369+
const std::vector<uint8_t>& tensor_data) {
370+
custom_qcir_protocol_buffer_ =
371+
QnnQcirCustomProtocol(ctx_bin.nbytes, tensor_data.size());
372+
custom_qcir_protocol_buffer_.BuildQcirCustomBuffer(ctx_bin, tensor_data);
373+
auto [ptr, size] = custom_qcir_protocol_buffer_.GetCustomProtocolBuffer();
374+
return {ptr, size};
375+
}
376+
341377
py::array_t<char> MakeBinaryInfo(const py::bytes& ctx_bin) {
342378
py::buffer_info info(py::buffer(ctx_bin).request());
343379
QnnExecuTorchContextBinary binary(
344380
{info.ptr, static_cast<uint64_t>(info.size * info.itemsize)});
345-
std::vector<uint8_t> tensor_data;
346-
auto binary_info = MakeBinaryInfo(binary, tensor_data);
347-
auto result = py::array_t<char>(binary_info.nbytes);
381+
382+
auto qnn_context_custom_protocol = QnnContextCustomProtocol(binary.nbytes);
383+
qnn_context_custom_protocol.BuildContextCustomBuffer(binary);
384+
auto [custom_buffer_ptr, custom_buffer_size] =
385+
qnn_context_custom_protocol.GetCustomProtocolBuffer();
386+
387+
auto result = py::array_t<char>(custom_buffer_size);
348388
auto result_buffer = result.request();
349-
std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes);
389+
std::memcpy(result_buffer.ptr, custom_buffer_ptr, custom_buffer_size);
350390
return result;
351391
}
352392

353393
private:
354-
std::string signature() {
355-
return std::to_string(
356-
std::chrono::high_resolution_clock::now().time_since_epoch().count());
357-
};
358-
359-
QnnExecuTorchContextBinary MakeBinaryInfo(
360-
const QnnExecuTorchContextBinary& ctx_bin,
361-
const std::vector<const flatbuffers::Vector64<uint8_t>*>& tensor_data) {
362-
// the build order matters, 64 bit data is required to be shipped first
363-
// add context data
364-
builder64_.Reset();
365-
auto offset_context = builder64_.CreateVector<
366-
uint8_t,
367-
flatbuffers::Offset64,
368-
flatbuffers::Vector64>(
369-
static_cast<const uint8_t*>(ctx_bin.buffer), ctx_bin.nbytes);
370-
// add tensor data
371-
// this is a little bit tricky but have smallest memory footprint in AoT
372-
size_t buffer_size = 0;
373-
for (auto& td : tensor_data) {
374-
buffer_size += td->size();
375-
}
376-
builder64_.StartVector<
377-
uint8_t,
378-
flatbuffers::Offset64,
379-
flatbuffers::Vector64<uint8_t>::size_type>(buffer_size);
380-
for (int i = tensor_data.size() - 1; i >= 0; --i) {
381-
builder64_.PushBytes(tensor_data[i]->Data(), tensor_data[i]->size());
382-
}
383-
auto offset_tensor = flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>(
384-
builder64_.EndVector<
385-
flatbuffers::Vector64<uint8_t>::size_type,
386-
flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>::offset_type>(
387-
buffer_size));
388-
// add signature to binary for cache reuse in runtime
389-
auto offset_signature = builder64_.CreateString(signature().c_str());
390-
// build binary info
391-
auto binary_info = qnn_delegate::CreateBinaryInfo(
392-
builder64_, offset_signature, offset_context, offset_tensor);
393-
builder64_.Finish(binary_info);
394-
395-
return QnnExecuTorchContextBinary(
396-
{builder64_.GetBufferPointer(), builder64_.GetSize()});
397-
}
398-
399-
QnnExecuTorchContextBinary MakeBinaryInfo(
400-
const QnnExecuTorchContextBinary& ctx_bin,
401-
const std::vector<uint8_t>& tensor_data) {
402-
// the build order matters, 64 bit data is required to be shipped first
403-
// add context data
404-
builder64_.Reset();
405-
406-
auto offset_context = builder64_.CreateVector<
407-
uint8_t,
408-
flatbuffers::Offset64,
409-
flatbuffers::Vector64>(
410-
static_cast<const uint8_t*>(ctx_bin.buffer), ctx_bin.nbytes);
411-
// add tensor data
412-
auto offset_tensor = builder64_.CreateVector<
413-
uint8_t,
414-
flatbuffers::Offset64,
415-
flatbuffers::Vector64>(
416-
static_cast<const uint8_t*>(tensor_data.data()), tensor_data.size());
417-
// add signature to binary for cache reuse in runtime
418-
auto offset_signature = builder64_.CreateString(signature().c_str());
419-
// build binary info
420-
auto binary_info = qnn_delegate::CreateBinaryInfo(
421-
builder64_, offset_signature, offset_context, offset_tensor);
422-
builder64_.Finish(binary_info);
423-
424-
return QnnExecuTorchContextBinary(
425-
{builder64_.GetBufferPointer(), builder64_.GetSize()});
426-
}
427-
428394
// Store the bytes object instead of a raw pointer so that this module will
429395
// keep the bytes alive.
430396
const py::bytes qnn_executorch_option_ptr_;
431397
QnnExecuTorchContextBinary qnn_executorch_context_binary_;
432398
std::shared_ptr<QnnManager> qnn_manager_;
433-
flatbuffers::FlatBufferBuilder64 builder64_;
399+
QnnQcirCustomProtocol custom_qcir_protocol_buffer_;
400+
QnnContextCustomProtocol custom_context_custom_buffer_;
434401
flatbuffers::FlatBufferBuilder builder_;
435-
flatbuffers::Verifier::Options fb_opt_;
436402
};
437403
} // namespace qnn
438404
} // namespace backends

backends/qualcomm/aot/python/targets.bzl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def define_common_targets():
3131
"//executorch/backends/qualcomm/aot/wrappers:wrappers",
3232
"//executorch/backends/qualcomm/runtime:logging",
3333
"//executorch/backends/qualcomm:schema",
34-
"//executorch/backends/qualcomm:qc_binary_info_schema",
3534
"//executorch/backends/qualcomm/aot/ir:qcir_utils",
3635
"//executorch/backends/qualcomm/runtime:runtime",
3736
"fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()),

backends/qualcomm/runtime/QnnExecuTorch.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
#ifdef __cplusplus
2020
extern "C" {
2121
#endif // __cplusplus
22+
23+
// This could be:
24+
// 1. qnn_context_binary
25+
// 2. QnnQcirCustomProtocol
26+
// 3. QnnContextCustomProtocol
27+
// To check if it is a custom protocol, users can deserialize the binary using
28+
// QnnCustomProtocol and check the status
2229
typedef struct {
2330
/// qnn_context_binary_blob
2431
void* buffer;

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <executorch/backends/qualcomm/qc_compiler_spec_generated.h>
1111
#include <executorch/backends/qualcomm/runtime/QnnExecuTorchBackend.h>
1212
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
13+
#include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
1314

1415
namespace executorch {
1516
namespace backends {
@@ -36,8 +37,23 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
3637
QnnExecuTorchContextBinary qnn_context_blob;
3738
const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options = nullptr;
3839

39-
qnn_context_blob.buffer = const_cast<void*>(processed->data());
40-
qnn_context_blob.nbytes = processed->size();
40+
auto [status, signature, ctx_size, ctx_bin] =
41+
QnnContextCustomProtocol().DeserializeContextCustomBuffer(
42+
const_cast<void*>(processed->data()));
43+
if (status == Error::Ok) {
44+
QNN_EXECUTORCH_LOG_INFO(
45+
"Deserializing processed data using QnnContextCustomProtocol");
46+
// After this stage, qnn_context_blob.nbytes & qnn_context_blob.buffer will
47+
// only store qnn_context_binary.
48+
qnn_context_blob.nbytes = ctx_size;
49+
qnn_context_blob.buffer = ctx_bin;
50+
} else {
51+
// This buffer will be verified again in QnnBackendCache.
52+
QNN_EXECUTORCH_LOG_INFO(
53+
"Deserializing processed data using QnnQcirCustomProtocol");
54+
qnn_context_blob.buffer = const_cast<void*>(processed->data());
55+
qnn_context_blob.nbytes = processed->size();
56+
}
4157

4258
// convert CompileSpec to qnn ExecuTorch option
4359
for (auto& compile_spec : compile_specs) {
@@ -62,7 +78,7 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
6278
// ---
6379
// check if current context binary has already been initialized
6480
// return cached one for reducing memory footprint
65-
std::string signature = qnn_manager->GetBinarySignature();
81+
6682
auto iter = delegate_map_.find(signature);
6783
if (iter != delegate_map_.end()) {
6884
QNN_EXECUTORCH_LOG_INFO(
@@ -186,7 +202,7 @@ bool QnnExecuTorchBackend::is_available() const {
186202
}
187203

188204
void QnnExecuTorchBackend::add_cached_delegate(
189-
const std::string& signature,
205+
const std::int64_t& signature,
190206
executorch::runtime::DelegateHandle* handle) const {
191207
std::lock_guard<std::mutex> guard(mutex_);
192208
delegate_map_[signature] = handle;

backends/qualcomm/runtime/QnnExecuTorchBackend.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ class QnnExecuTorchBackend final
4040

4141
private:
4242
void add_cached_delegate(
43-
const std::string& signature,
43+
const std::int64_t& signature,
4444
executorch::runtime::DelegateHandle* handle) const;
4545
void erase_cached_delegate(executorch::runtime::DelegateHandle* handle) const;
4646

4747
mutable std::mutex mutex_;
48-
mutable std::unordered_map<std::string, executorch::runtime::DelegateHandle*>
48+
mutable std::unordered_map<int64_t, executorch::runtime::DelegateHandle*>
4949
delegate_map_;
50-
mutable std::unordered_map<executorch::runtime::DelegateHandle*, std::string>
50+
mutable std::unordered_map<executorch::runtime::DelegateHandle*, std::int64_t>
5151
delegate_map_rev_;
5252
};
5353

0 commit comments

Comments
 (0)