Commit 645fb63

Qualcomm AI Engine Direct - Support Hybrid Mode for Llama3.2
1 parent 383aa70 commit 645fb63

23 files changed: +1022 additions, -394 deletions

backends/qualcomm/aot/ir/qcir.fbs

Lines changed: 5 additions & 4 deletions
@@ -80,17 +80,18 @@ table Tensor {
   type: TensorType;
   dtype: DataType;
   qparam: QuantizeParam;
-  data: [ubyte];
+  size: uint;
+  offset: ulong;
 }
 
 table Operator {
   name: string;
   package_name: string;
   type_name: string;
   // keep only tensor indexes
-  inputs: [int];
-  outputs: [int];
-  params: [int];
+  inputs: [uint];
+  outputs: [uint];
+  params: [uint];
 }
 
 table Graph {
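
The schema change above drops the inlined data: [ubyte] payload and instead records a (size, offset) pair pointing into a shared tensor-data blob shipped next to the IR. A minimal stand-alone sketch (plain C++; the names are illustrative and not part of the commit) of how a consumer resolves a tensor's bytes under the new layout:

#include <cstdint>

// blob_base stands for the shared tensor-data buffer carried alongside the
// qcir context; offset/size correspond to the new schema fields above.
inline const std::uint8_t* ResolveTensorData(
    const std::uint8_t* blob_base, std::uint64_t offset, std::uint32_t size) {
  // Tensors without a static payload are expected to report size 0.
  return size == 0 ? nullptr : blob_base + offset;
}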

backends/qualcomm/aot/ir/qcir_utils.cpp

Lines changed: 6 additions & 8 deletions
@@ -235,11 +235,8 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
 
 flatbuffers::Offset<qcir::Tensor> ToTensor(
     const Qnn_Tensor_t& tensor,
+    const uint64_t data_offset,
     flatbuffers::FlatBufferBuilder* builder) {
-  std::vector<uint8_t> buffer(
-      static_cast<uint8_t*>(QNN_VER_PTR(tensor)->clientBuf.data),
-      static_cast<uint8_t*>(QNN_VER_PTR(tensor)->clientBuf.data) +
-          QNN_VER_PTR(tensor)->clientBuf.dataSize);
   std::vector<uint32_t> shape(
       QNN_VER_PTR(tensor)->dimensions,
       QNN_VER_PTR(tensor)->dimensions + QNN_VER_PTR(tensor)->rank);
@@ -251,10 +248,11 @@ flatbuffers::Offset<qcir::Tensor> ToTensor(
       ToTensorType(QNN_VER_PTR(tensor)->type),
       ToDataType(QNN_VER_PTR(tensor)->dataType),
       ToQuantizeParam(tensor, builder),
-      &buffer);
+      QNN_VER_PTR(tensor)->clientBuf.dataSize,
+      data_offset);
 }
 
-Qnn_Tensor_t ToTensor(const tensor_type& tensor) {
+Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr) {
   auto is_io_tensor = [](Qnn_TensorType_t type) {
     return type < QNN_TENSOR_TYPE_STATIC;
   };
@@ -266,10 +264,10 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) {
   QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor);
   QNN_VER_PTR(t)->rank = tensor->shape()->size();
   QNN_VER_PTR(t)->dimensions = const_cast<uint32_t*>(tensor->shape()->data());
-  QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size();
+  QNN_VER_PTR(t)->clientBuf.dataSize = tensor->size();
   QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type)
       ? nullptr
-      : static_cast<void*>(const_cast<uint8_t*>(tensor->data()->Data()));
+      : static_cast<void*>(const_cast<uint8_t*>(data_ptr));
   return t;
 }
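
With this change, serialization no longer copies the client buffer into the IR: the outgoing ToTensor records only (dataSize, data_offset), and the incoming ToTensor re-binds clientBuf to a caller-supplied pointer. A dependency-free sketch of that round trip using stand-in types (assumed for illustration, not QNN's real structs):

#include <cstdint>

struct SerializedTensor {  // stand-in for qcir::Tensor's new size/offset fields
  std::uint32_t size;
  std::uint64_t offset;
};

struct ClientBuf {  // stand-in for the QNN client buffer
  void* data;
  std::uint32_t dataSize;
};

// Serialize: keep only bookkeeping; the bytes go into a shared blob elsewhere.
inline SerializedTensor Serialize(const ClientBuf& buf, std::uint64_t data_offset) {
  return SerializedTensor{buf.dataSize, data_offset};
}

// Deserialize: point the client buffer back into the shared blob.
inline ClientBuf Deserialize(const SerializedTensor& t, const std::uint8_t* data_ptr) {
  return ClientBuf{const_cast<std::uint8_t*>(data_ptr), t.size};
}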

backends/qualcomm/aot/ir/qcir_utils.h

Lines changed: 2 additions & 1 deletion
@@ -32,8 +32,9 @@ Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor);
 
 flatbuffers::Offset<qcir::Tensor> ToTensor(
     const Qnn_Tensor_t& tensor,
+    const uint64_t data_offset,
     flatbuffers::FlatBufferBuilder* builder);
-Qnn_Tensor_t ToTensor(const tensor_type& tensor);
+Qnn_Tensor_t ToTensor(const tensor_type& tensor, const uint8_t* data_ptr);
 
 } // namespace qnn
 } // namespace backends

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 145 additions & 44 deletions
@@ -57,47 +57,67 @@ class PyQnnManager {
         qnn_executorch_option_ptr_.cast<std::string_view>().data());
 
     // merge multiple qcirs into one context with multiple graphs
-    std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
+
+    // this makes it easier to do subtraction for offsets
+    std::vector<uint32_t> offsets(1, 0);
+    std::vector<const flatbuffers::Vector64<uint8_t>*> tensor_data;
+    fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE;
     for (size_t i = 0; i < qcirs.size(); ++i) {
       py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
       flatbuffers::Verifier verifier_binary_info(
           static_cast<const uint8_t* const>(info.ptr),
-          info.size * info.itemsize);
+          info.size * info.itemsize,
+          fb_opt_);
       if (!qnn_delegate::VerifyBinaryInfoBuffer(verifier_binary_info)) {
         QNN_EXECUTORCH_LOG_ERROR("Fail to verify binary info");
         return;
       }
       auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);
+      tensor_data.push_back(binary_info->tensor_data());
 
       flatbuffers::Verifier verifier_qcir(
-          binary_info->data()->data(), binary_info->data()->size());
+          binary_info->context_data()->Data(),
+          binary_info->context_data()->size());
       if (!qcir::VerifyContextBuffer(verifier_qcir)) {
         QNN_EXECUTORCH_LOG_ERROR("Fail to verify qcir format");
        return;
       }
-      auto context = qcir::GetContext(binary_info->data()->data());
+      offsets.push_back(offsets.back() + binary_info->tensor_data()->size());
+    }
+
+    std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
+    for (size_t i = 0; i < qcirs.size(); ++i) {
+      py::buffer_info info(py::buffer(qcirs[i].cast<py::bytes>()).request());
+      auto binary_info = qnn_delegate::GetBinaryInfo(info.ptr);
+      auto context = qcir::GetContext(binary_info->context_data()->Data());
       for (const auto& graph : *context->graphs()) {
         std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
         for (const auto tensor : *graph->tensors()) {
           // here we need to take a detour to merge multiple qcir flatbuffers
           // outer ToTensor
           // return: flatbuffers::Offset<Tensor>
-          // consume: QnnTensor, flatbuffers::FlatBufferBuilder*
+          // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder*
           // inner ToTensor
           // return: QnnTensor
-          // consume: flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>
-          tensors.emplace_back(ToTensor(ToTensor(tensor), &builder_));
+          // consume:
+          // flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>,
+          // data_ptr
+          tensors.emplace_back(ToTensor(
+              ToTensor(tensor, nullptr),
+              offsets[i] + tensor->offset(),
+              &builder_));
         }
         std::vector<flatbuffers::Offset<qcir::Operator>> nodes;
         for (const auto& node : *graph->nodes()) {
-          int32_t* inputs_ptr = const_cast<int32_t*>(node->inputs()->data());
-          int32_t* outputs_ptr = const_cast<int32_t*>(node->outputs()->data());
-          int32_t* params_ptr = const_cast<int32_t*>(node->params()->data());
-          std::vector<int32_t> inputs(
+          uint32_t* inputs_ptr = const_cast<uint32_t*>(node->inputs()->data());
+          uint32_t* outputs_ptr =
+              const_cast<uint32_t*>(node->outputs()->data());
+          uint32_t* params_ptr = const_cast<uint32_t*>(node->params()->data());
+          std::vector<uint32_t> inputs(
              inputs_ptr, inputs_ptr + node->inputs()->size());
-          std::vector<int32_t> outputs(
+          std::vector<uint32_t> outputs(
              outputs_ptr, outputs_ptr + node->outputs()->size());
-          std::vector<int32_t> params(
+          std::vector<uint32_t> params(
              params_ptr, params_ptr + node->params()->size());
           nodes.emplace_back(qcir::CreateOperatorDirect(
               builder_,
@@ -118,7 +138,7 @@ class PyQnnManager {
     QnnExecuTorchContextBinary qcir_bin(
         {builder_.GetBufferPointer(), builder_.GetSize()});
 
-    qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin);
+    qnn_executorch_context_binary_ = MakeBinaryInfo(qcir_bin, tensor_data);
     qnn_manager_ = std::make_shared<QnnManager>(
         qnn_executorch_options, qnn_executorch_context_binary_);
   }
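
The two passes in the constructor above hinge on simple prefix-sum bookkeeping: offsets starts at {0}, the first pass appends the running total plus each qcir's tensor-blob size, and the second pass locates a tensor's absolute position in the merged blob as offsets[i] + tensor->offset(). A self-contained illustration with made-up sizes (not part of the commit):

#include <cstdint>
#include <vector>

int main() {
  std::vector<std::uint64_t> blob_sizes = {1024, 4096, 512};  // per-qcir tensor_data sizes
  std::vector<std::uint64_t> offsets(1, 0);  // leading 0 keeps the bookkeeping trivial
  for (std::uint64_t size : blob_sizes) {
    offsets.push_back(offsets.back() + size);
  }
  // offsets == {0, 1024, 5120, 5632}; a tensor at local offset 16 inside the
  // second qcir lands at 1024 + 16 in the merged buffer.
  std::uint64_t absolute = offsets[1] + 16;
  return static_cast<int>(absolute == 1040 ? 0 : 1);
}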
@@ -157,26 +177,37 @@ class PyQnnManager {
 
     if (qnn_manager_->IsOnlinePrepare() || qnn_manager_->IsMultipleGraphs()) {
       builder_.Reset();
-      std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
+      std::vector<uint8_t> tensor_data;
+      std::vector<uint64_t> offsets;
       std::unordered_map<void*, int> tensor_map;
+      std::vector<flatbuffers::Offset<qcir::Tensor>> fb_tensors;
+      std::vector<flatbuffers::Offset<qcir::Operator>> fb_ops;
 
       auto set_tensor = [&](const std::shared_ptr<TensorWrapper>& wrapper,
-                            std::vector<int>& index) {
+                            std::vector<uint32_t>& index) {
         auto it = tensor_map.find(wrapper.get());
         if (it != tensor_map.end()) {
           index.push_back(it->second);
         } else {
-          int i = tensors.size();
-          tensor_map[wrapper.get()] = i;
-          index.push_back(i);
-          tensors.emplace_back(
-              ToTensor(wrapper->CloneTensorStruct(), &builder_));
+          tensor_map[wrapper.get()] = fb_tensors.size();
+          index.push_back(fb_tensors.size());
+          offsets.push_back(tensor_data.size());
+          Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct();
+          fb_tensors.emplace_back(
+              ToTensor(qnn_tensor, offsets.back(), &builder_));
+          uint8_t* data_ptr =
+              static_cast<uint8_t*>(QNN_VER_PTR(qnn_tensor)->clientBuf.data);
+          if (data_ptr != nullptr) {
+            tensor_data.insert(
+                tensor_data.end(),
+                data_ptr,
+                data_ptr + QNN_VER_PTR(qnn_tensor)->clientBuf.dataSize);
+          }
         }
       };
 
-      std::vector<flatbuffers::Offset<qcir::Operator>> operators;
       for (std::shared_ptr<OpWrapper>& op_wrapper : op_wrappers) {
-        std::vector<int> inputs, outputs, params;
+        std::vector<uint32_t> inputs, outputs, params;
 
         for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) {
           set_tensor(tensor_wrapper, inputs);
@@ -207,13 +238,22 @@ class PyQnnManager {
                 static_cast<void*>(&p.scalarParam.uint8Value);
             QNN_VER_PTR(t)->clientBuf.dataSize =
                 GetDataTypeSize(QNN_VER_PTR(t)->dataType);
-            params.push_back(tensors.size());
-            tensors.emplace_back(ToTensor(t, &builder_));
+
+            // collect tensor data
+            offsets.push_back(tensor_data.size());
+            const uint8_t* data_ptr =
+                static_cast<uint8_t*>(QNN_VER_PTR(t)->clientBuf.data);
+            tensor_data.insert(
+                tensor_data.end(),
+                data_ptr,
+                data_ptr + QNN_VER_PTR(t)->clientBuf.dataSize);
+            params.push_back(fb_tensors.size());
+            fb_tensors.emplace_back(ToTensor(t, offsets.back(), &builder_));
           }
         }
 
         Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig();
-        operators.emplace_back(qcir::CreateOperatorDirect(
+        fb_ops.emplace_back(qcir::CreateOperatorDirect(
            builder_,
            QNN_VER_PTR(op_config)->name,
            QNN_VER_PTR(op_config)->packageName,
@@ -222,14 +262,16 @@ class PyQnnManager {
            &outputs,
            &params));
       }
-      auto graph = qcir::CreateGraphDirect(
-          builder_, graph_name.c_str(), &operators, &tensors);
-      std::vector<flatbuffers::Offset<qcir::Graph>> graphs({graph});
-      auto context = qcir::CreateContextDirect(builder_, &graphs);
+
+      std::vector<flatbuffers::Offset<qcir::Graph>> fb_graphs(
+          {qcir::CreateGraphDirect(
+              builder_, graph_name.c_str(), &fb_ops, &fb_tensors)});
+      auto context = qcir::CreateContextDirect(builder_, &fb_graphs);
       builder_.Finish(context);
+
       QnnExecuTorchContextBinary qcir_binary(
           {builder_.GetBufferPointer(), builder_.GetSize()});
-      binary_info = MakeBinaryInfo(qcir_binary);
+      binary_info = MakeBinaryInfo(qcir_binary, tensor_data);
     } else {
       if (qnn_manager_->Compile(graph_name, op_wrappers) !=
           executorch::runtime::Error::Ok) {
@@ -300,38 +342,97 @@ class PyQnnManager {
     py::buffer_info info(py::buffer(ctx_bin).request());
     QnnExecuTorchContextBinary binary(
         {info.ptr, static_cast<uint64_t>(info.size * info.itemsize)});
-    auto binary_info = MakeBinaryInfo(binary);
+    std::vector<uint8_t> tensor_data;
+    auto binary_info = MakeBinaryInfo(binary, tensor_data);
     auto result = py::array_t<char>(binary_info.nbytes);
     auto result_buffer = result.request();
     std::memcpy(result_buffer.ptr, binary_info.buffer, binary_info.nbytes);
     return result;
   }
 
  private:
+  std::string signature() {
+    return std::to_string(
+        std::chrono::high_resolution_clock::now().time_since_epoch().count());
+  };
+
   QnnExecuTorchContextBinary MakeBinaryInfo(
-      const QnnExecuTorchContextBinary& ctx_bin) {
-    auto signature = []() {
-      return std::to_string(
-          std::chrono::high_resolution_clock::now().time_since_epoch().count());
-    };
-    const uint8_t* base = static_cast<uint8_t*>(ctx_bin.buffer);
-    std::vector<uint8_t> data(base, base + ctx_bin.nbytes);
+      const QnnExecuTorchContextBinary& ctx_bin,
+      const std::vector<const flatbuffers::Vector64<uint8_t>*>& tensor_data) {
+    // the build order matters, 64 bit data is required to be shipped first
+    // add context data
+    builder64_.Reset();
+    auto offset_context = builder64_.CreateVector<
+        uint8_t,
+        flatbuffers::Offset64,
+        flatbuffers::Vector64>(
+        static_cast<const uint8_t*>(ctx_bin.buffer), ctx_bin.nbytes);
+    // add tensor data
+    // this is a little bit tricky but have smallest memory footprint in AoT
+    size_t buffer_size = 0;
+    for (auto& td : tensor_data) {
+      buffer_size += td->size();
+    }
+    builder64_.StartVector<
+        uint8_t,
+        flatbuffers::Offset64,
+        flatbuffers::Vector64<uint8_t>::size_type>(buffer_size);
+    for (int i = tensor_data.size() - 1; i >= 0; --i) {
+      builder64_.PushBytes(tensor_data[i]->Data(), tensor_data[i]->size());
+    }
+    auto offset_tensor = flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>(
+        builder64_.EndVector<
+            flatbuffers::Vector64<uint8_t>::size_type,
+            flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>::offset_type>(
+            buffer_size));
     // add signature to binary for cache reuse in runtime
-    builder_.Reset();
-    auto binary_info = qnn_delegate::CreateBinaryInfoDirect(
-        builder_, signature().c_str(), &data);
-    builder_.Finish(binary_info);
+    auto offset_signature = builder64_.CreateString(signature().c_str());
+    // build binary info
+    auto binary_info = qnn_delegate::CreateBinaryInfo(
+        builder64_, offset_signature, offset_context, offset_tensor);
+    builder64_.Finish(binary_info);
 
     return QnnExecuTorchContextBinary(
-        {builder_.GetBufferPointer(), builder_.GetSize()});
+        {builder64_.GetBufferPointer(), builder64_.GetSize()});
+  }
+
+  QnnExecuTorchContextBinary MakeBinaryInfo(
+      const QnnExecuTorchContextBinary& ctx_bin,
+      const std::vector<uint8_t>& tensor_data) {
+    // the build order matters, 64 bit data is required to be shipped first
+    // add context data
+    builder64_.Reset();
+
+    auto offset_context = builder64_.CreateVector<
+        uint8_t,
+        flatbuffers::Offset64,
+        flatbuffers::Vector64>(
+        static_cast<const uint8_t*>(ctx_bin.buffer), ctx_bin.nbytes);
+    // add tensor data
+    auto offset_tensor = builder64_.CreateVector<
+        uint8_t,
+        flatbuffers::Offset64,
+        flatbuffers::Vector64>(
+        static_cast<const uint8_t*>(tensor_data.data()), tensor_data.size());
+    // add signature to binary for cache reuse in runtime
+    auto offset_signature = builder64_.CreateString(signature().c_str());
+    // build binary info
+    auto binary_info = qnn_delegate::CreateBinaryInfo(
+        builder64_, offset_signature, offset_context, offset_tensor);
+    builder64_.Finish(binary_info);
+
+    return QnnExecuTorchContextBinary(
+        {builder64_.GetBufferPointer(), builder64_.GetSize()});
   }
 
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.
   const py::bytes qnn_executorch_option_ptr_;
   QnnExecuTorchContextBinary qnn_executorch_context_binary_;
   std::shared_ptr<QnnManager> qnn_manager_;
+  flatbuffers::FlatBufferBuilder64 builder64_;
   flatbuffers::FlatBufferBuilder builder_;
+  flatbuffers::Verifier::Options fb_opt_;
 };
 } // namespace qnn
 } // namespace backends
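
One non-obvious detail in the first MakeBinaryInfo overload above: the per-qcir blobs are pushed in reverse order. FlatBuffers builders grow their buffer back to front, so the bytes pushed last end up first; iterating tensor_data from the end therefore leaves the concatenated blob in forward order without an extra staging copy. A plain-C++ sketch of that effect (no FlatBuffers dependency; a deque stands in for the back-to-front builder, values are made up):

#include <cassert>
#include <cstdint>
#include <deque>
#include <vector>

int main() {
  std::vector<std::vector<std::uint8_t>> chunks = {{1, 2}, {3}, {4, 5, 6}};
  std::deque<std::uint8_t> buffer;  // grows at the front, like the builder
  for (int i = static_cast<int>(chunks.size()) - 1; i >= 0; --i) {
    // "PushBytes": prepend the whole chunk, preserving its internal order.
    buffer.insert(buffer.begin(), chunks[i].begin(), chunks[i].end());
  }
  std::vector<std::uint8_t> merged(buffer.begin(), buffer.end());
  assert((merged == std::vector<std::uint8_t>{1, 2, 3, 4, 5, 6}));
  return 0;
}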

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp

Lines changed: 1 addition & 13 deletions
@@ -79,18 +79,6 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
   std::unique_ptr<QuantizeParamsWrapper> quantize_param_wrapper =
       CreateQuantizationParamWrapper(encoding, quant_info);
 
-  if (data.size() == 0) {
-    return CreateTensorWrapper(
-        tensor_name,
-        tensor_type,
-        data_type,
-        std::move(quantize_param_wrapper),
-        rank,
-        dims.data(),
-        0,
-        nullptr,
-        copy_data);
-  }
   return CreateTensorWrapper(
       tensor_name,
       tensor_type,
@@ -99,7 +87,7 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
       rank,
      dims.data(),
      0,
-      data.data(),
+      data.size() == 0 ? nullptr : data.data(),
      copy_data);
 }
