@@ -57,47 +57,67 @@ class PyQnnManager {
57
57
qnn_executorch_option_ptr_.cast <std::string_view>().data ());
58
58
59
59
// merge multiple qcirs into one context with multiple graphs
60
- std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
60
+
61
+ // this makes it easier to do subtraction for offsets
62
+ std::vector<uint32_t > offsets (1 , 0 );
63
+ std::vector<const flatbuffers::Vector64<uint8_t >*> tensor_data;
64
+ fb_opt_.max_size = FLATBUFFERS_MAX_64_BUFFER_SIZE;
61
65
for (size_t i = 0 ; i < qcirs.size (); ++i) {
62
66
py::buffer_info info (py::buffer (qcirs[i].cast <py::bytes>()).request ());
63
67
flatbuffers::Verifier verifier_binary_info (
64
68
static_cast <const uint8_t * const >(info.ptr ),
65
- info.size * info.itemsize );
69
+ info.size * info.itemsize ,
70
+ fb_opt_);
66
71
if (!qnn_delegate::VerifyBinaryInfoBuffer (verifier_binary_info)) {
67
72
QNN_EXECUTORCH_LOG_ERROR (" Fail to verify binary info" );
68
73
return ;
69
74
}
70
75
auto binary_info = qnn_delegate::GetBinaryInfo (info.ptr );
76
+ tensor_data.push_back (binary_info->tensor_data ());
71
77
72
78
flatbuffers::Verifier verifier_qcir (
73
- binary_info->data ()->data (), binary_info->data ()->size ());
79
+ binary_info->context_data ()->Data (),
80
+ binary_info->context_data ()->size ());
74
81
if (!qcir::VerifyContextBuffer (verifier_qcir)) {
75
82
QNN_EXECUTORCH_LOG_ERROR (" Fail to verify qcir format" );
76
83
return ;
77
84
}
78
- auto context = qcir::GetContext (binary_info->data ()->data ());
85
+ offsets.push_back (offsets.back () + binary_info->tensor_data ()->size ());
86
+ }
87
+
88
+ std::vector<flatbuffers::Offset<qcir::Graph>> graphs;
89
+ for (size_t i = 0 ; i < qcirs.size (); ++i) {
90
+ py::buffer_info info (py::buffer (qcirs[i].cast <py::bytes>()).request ());
91
+ auto binary_info = qnn_delegate::GetBinaryInfo (info.ptr );
92
+ auto context = qcir::GetContext (binary_info->context_data ()->Data ());
79
93
for (const auto & graph : *context->graphs ()) {
80
94
std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
81
95
for (const auto tensor : *graph->tensors ()) {
82
96
// here we need to take a detour to merge multiple qcir flatbuffers
83
97
// outer ToTensor
84
98
// return: flatbuffers::Offset<Tensor>
85
- // consume: QnnTensor, flatbuffers::FlatBufferBuilder*
99
+ // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder*
86
100
// inner ToTensor
87
101
// return: QnnTensor
88
- // consume: flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>
89
- tensors.emplace_back (ToTensor (ToTensor (tensor), &builder_));
102
+ // consume:
103
+ // flatbuffers::Vector<::flatbuffers::Offset<qcir::Tensor>>,
104
+ // data_ptr
105
+ tensors.emplace_back (ToTensor (
106
+ ToTensor (tensor, nullptr ),
107
+ offsets[i] + tensor->offset (),
108
+ &builder_));
90
109
}
91
110
std::vector<flatbuffers::Offset<qcir::Operator>> nodes;
92
111
for (const auto & node : *graph->nodes ()) {
93
- int32_t * inputs_ptr = const_cast <int32_t *>(node->inputs ()->data ());
94
- int32_t * outputs_ptr = const_cast <int32_t *>(node->outputs ()->data ());
95
- int32_t * params_ptr = const_cast <int32_t *>(node->params ()->data ());
96
- std::vector<int32_t > inputs (
112
+ uint32_t * inputs_ptr = const_cast <uint32_t *>(node->inputs ()->data ());
113
+ uint32_t * outputs_ptr =
114
+ const_cast <uint32_t *>(node->outputs ()->data ());
115
+ uint32_t * params_ptr = const_cast <uint32_t *>(node->params ()->data ());
116
+ std::vector<uint32_t > inputs (
97
117
inputs_ptr, inputs_ptr + node->inputs ()->size ());
98
- std::vector<int32_t > outputs (
118
+ std::vector<uint32_t > outputs (
99
119
outputs_ptr, outputs_ptr + node->outputs ()->size ());
100
- std::vector<int32_t > params (
120
+ std::vector<uint32_t > params (
101
121
params_ptr, params_ptr + node->params ()->size ());
102
122
nodes.emplace_back (qcir::CreateOperatorDirect (
103
123
builder_,
@@ -118,7 +138,7 @@ class PyQnnManager {
118
138
QnnExecuTorchContextBinary qcir_bin (
119
139
{builder_.GetBufferPointer (), builder_.GetSize ()});
120
140
121
- qnn_executorch_context_binary_ = MakeBinaryInfo (qcir_bin);
141
+ qnn_executorch_context_binary_ = MakeBinaryInfo (qcir_bin, tensor_data );
122
142
qnn_manager_ = std::make_shared<QnnManager>(
123
143
qnn_executorch_options, qnn_executorch_context_binary_);
124
144
}
@@ -157,26 +177,37 @@ class PyQnnManager {
157
177
158
178
if (qnn_manager_->IsOnlinePrepare () || qnn_manager_->IsMultipleGraphs ()) {
159
179
builder_.Reset ();
160
- std::vector<flatbuffers::Offset<qcir::Tensor>> tensors;
180
+ std::vector<uint8_t > tensor_data;
181
+ std::vector<uint64_t > offsets;
161
182
std::unordered_map<void *, int > tensor_map;
183
+ std::vector<flatbuffers::Offset<qcir::Tensor>> fb_tensors;
184
+ std::vector<flatbuffers::Offset<qcir::Operator>> fb_ops;
162
185
163
186
auto set_tensor = [&](const std::shared_ptr<TensorWrapper>& wrapper,
164
- std::vector<int >& index) {
187
+ std::vector<uint32_t >& index) {
165
188
auto it = tensor_map.find (wrapper.get ());
166
189
if (it != tensor_map.end ()) {
167
190
index.push_back (it->second );
168
191
} else {
169
- int i = tensors.size ();
170
- tensor_map[wrapper.get ()] = i;
171
- index.push_back (i);
172
- tensors.emplace_back (
173
- ToTensor (wrapper->CloneTensorStruct (), &builder_));
192
+ tensor_map[wrapper.get ()] = fb_tensors.size ();
193
+ index.push_back (fb_tensors.size ());
194
+ offsets.push_back (tensor_data.size ());
195
+ Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct ();
196
+ fb_tensors.emplace_back (
197
+ ToTensor (qnn_tensor, offsets.back (), &builder_));
198
+ uint8_t * data_ptr =
199
+ static_cast <uint8_t *>(QNN_VER_PTR (qnn_tensor)->clientBuf .data );
200
+ if (data_ptr != nullptr ) {
201
+ tensor_data.insert (
202
+ tensor_data.end (),
203
+ data_ptr,
204
+ data_ptr + QNN_VER_PTR (qnn_tensor)->clientBuf .dataSize );
205
+ }
174
206
}
175
207
};
176
208
177
- std::vector<flatbuffers::Offset<qcir::Operator>> operators;
178
209
for (std::shared_ptr<OpWrapper>& op_wrapper : op_wrappers) {
179
- std::vector<int > inputs, outputs, params;
210
+ std::vector<uint32_t > inputs, outputs, params;
180
211
181
212
for (const auto & tensor_wrapper : op_wrapper->GetInputTensors ()) {
182
213
set_tensor (tensor_wrapper, inputs);
@@ -207,13 +238,22 @@ class PyQnnManager {
207
238
static_cast <void *>(&p.scalarParam .uint8Value );
208
239
QNN_VER_PTR (t)->clientBuf .dataSize =
209
240
GetDataTypeSize (QNN_VER_PTR (t)->dataType );
210
- params.push_back (tensors.size ());
211
- tensors.emplace_back (ToTensor (t, &builder_));
241
+
242
+ // collect tensor data
243
+ offsets.push_back (tensor_data.size ());
244
+ const uint8_t * data_ptr =
245
+ static_cast <uint8_t *>(QNN_VER_PTR (t)->clientBuf .data );
246
+ tensor_data.insert (
247
+ tensor_data.end (),
248
+ data_ptr,
249
+ data_ptr + QNN_VER_PTR (t)->clientBuf .dataSize );
250
+ params.push_back (fb_tensors.size ());
251
+ fb_tensors.emplace_back (ToTensor (t, offsets.back (), &builder_));
212
252
}
213
253
}
214
254
215
255
Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig ();
216
- operators .emplace_back (qcir::CreateOperatorDirect (
256
+ fb_ops .emplace_back (qcir::CreateOperatorDirect (
217
257
builder_,
218
258
QNN_VER_PTR (op_config)->name ,
219
259
QNN_VER_PTR (op_config)->packageName ,
@@ -222,14 +262,16 @@ class PyQnnManager {
222
262
&outputs,
223
263
¶ms));
224
264
}
225
- auto graph = qcir::CreateGraphDirect (
226
- builder_, graph_name.c_str (), &operators, &tensors);
227
- std::vector<flatbuffers::Offset<qcir::Graph>> graphs ({graph});
228
- auto context = qcir::CreateContextDirect (builder_, &graphs);
265
+
266
+ std::vector<flatbuffers::Offset<qcir::Graph>> fb_graphs (
267
+ {qcir::CreateGraphDirect (
268
+ builder_, graph_name.c_str (), &fb_ops, &fb_tensors)});
269
+ auto context = qcir::CreateContextDirect (builder_, &fb_graphs);
229
270
builder_.Finish (context);
271
+
230
272
QnnExecuTorchContextBinary qcir_binary (
231
273
{builder_.GetBufferPointer (), builder_.GetSize ()});
232
- binary_info = MakeBinaryInfo (qcir_binary);
274
+ binary_info = MakeBinaryInfo (qcir_binary, tensor_data );
233
275
} else {
234
276
if (qnn_manager_->Compile (graph_name, op_wrappers) !=
235
277
executorch::runtime::Error::Ok) {
@@ -300,38 +342,97 @@ class PyQnnManager {
300
342
py::buffer_info info (py::buffer (ctx_bin).request ());
301
343
QnnExecuTorchContextBinary binary (
302
344
{info.ptr , static_cast <uint64_t >(info.size * info.itemsize )});
303
- auto binary_info = MakeBinaryInfo (binary);
345
+ std::vector<uint8_t > tensor_data;
346
+ auto binary_info = MakeBinaryInfo (binary, tensor_data);
304
347
auto result = py::array_t <char >(binary_info.nbytes );
305
348
auto result_buffer = result.request ();
306
349
std::memcpy (result_buffer.ptr , binary_info.buffer , binary_info.nbytes );
307
350
return result;
308
351
}
309
352
310
353
private:
354
// Builds a signature string from the current clock reading: the
// high_resolution_clock tick count since its epoch, rendered in decimal.
std::string signature() {
  const auto since_epoch =
      std::chrono::high_resolution_clock::now().time_since_epoch();
  return std::to_string(since_epoch.count());
}
358
+
311
359
// Serializes a BinaryInfo flatbuffer into builder64_ from a context binary
// and a list of tensor-data chunks.
//
// ctx_bin:     context bytes to embed (ctx_bin.buffer / ctx_bin.nbytes).
// tensor_data: chunks that are written into a single 64-bit byte vector
//              without first being merged in a temporary buffer.
// Returns a {pointer, size} view of builder64_'s finished buffer. The view
// points into builder64_, and this function resets builder64_ on entry, so a
// later call presumably invalidates earlier results.
QnnExecuTorchContextBinary MakeBinaryInfo(
    const QnnExecuTorchContextBinary& ctx_bin,
    const std::vector<const flatbuffers::Vector64<uint8_t>*>& tensor_data) {
  // the build order matters, 64 bit data is required to be shipped first
  builder64_.Reset();

  // add context data
  auto offset_context = builder64_.CreateVector<
      uint8_t,
      flatbuffers::Offset64,
      flatbuffers::Vector64>(
      static_cast<const uint8_t*>(ctx_bin.buffer), ctx_bin.nbytes);

  // add tensor data
  // The chunks are streamed directly into one vector; this is a little bit
  // tricky but has the smallest memory footprint in AoT.
  size_t buffer_size = 0;
  for (const auto* chunk : tensor_data) {
    buffer_size += chunk->size();
  }
  builder64_.StartVector<
      uint8_t,
      flatbuffers::Offset64,
      flatbuffers::Vector64<uint8_t>::size_type>(buffer_size);
  // Chunks are pushed last-to-first. NOTE(review): this relies on the builder
  // writing its buffer back-to-front so the chunks end up in their original
  // order -- confirm against the FlatBuffers builder documentation.
  // Reverse iterators avoid narrowing tensor_data.size() into a signed int.
  for (auto it = tensor_data.rbegin(); it != tensor_data.rend(); ++it) {
    builder64_.PushBytes((*it)->Data(), (*it)->size());
  }
  auto offset_tensor = flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>(
      builder64_.EndVector<
          flatbuffers::Vector64<uint8_t>::size_type,
          flatbuffers::Offset64<flatbuffers::Vector64<uint8_t>>::offset_type>(
          buffer_size));

  // add signature to binary for cache reuse in runtime
  auto offset_signature = builder64_.CreateString(signature().c_str());

  // build binary info
  auto binary_info = qnn_delegate::CreateBinaryInfo(
      builder64_, offset_signature, offset_context, offset_tensor);
  builder64_.Finish(binary_info);

  return QnnExecuTorchContextBinary(
      {builder64_.GetBufferPointer(), builder64_.GetSize()});
}
398
+
399
+ QnnExecuTorchContextBinary MakeBinaryInfo (
400
+ const QnnExecuTorchContextBinary& ctx_bin,
401
+ const std::vector<uint8_t >& tensor_data) {
402
+ // the build order matters, 64 bit data is required to be shipped first
403
+ // add context data
404
+ builder64_.Reset ();
405
+
406
+ auto offset_context = builder64_.CreateVector <
407
+ uint8_t ,
408
+ flatbuffers::Offset64,
409
+ flatbuffers::Vector64>(
410
+ static_cast <const uint8_t *>(ctx_bin.buffer ), ctx_bin.nbytes );
411
+ // add tensor data
412
+ auto offset_tensor = builder64_.CreateVector <
413
+ uint8_t ,
414
+ flatbuffers::Offset64,
415
+ flatbuffers::Vector64>(
416
+ static_cast <const uint8_t *>(tensor_data.data ()), tensor_data.size ());
417
+ // add signature to binary for cache reuse in runtime
418
+ auto offset_signature = builder64_.CreateString (signature ().c_str ());
419
+ // build binary info
420
+ auto binary_info = qnn_delegate::CreateBinaryInfo (
421
+ builder64_, offset_signature, offset_context, offset_tensor);
422
+ builder64_.Finish (binary_info);
423
+
424
+ return QnnExecuTorchContextBinary (
425
+ {builder64_.GetBufferPointer (), builder64_.GetSize ()});
327
426
}
328
427
329
428
// Store the bytes object instead of a raw pointer so that this module will
330
429
// keep the bytes alive.
331
430
const py::bytes qnn_executorch_option_ptr_;
332
431
QnnExecuTorchContextBinary qnn_executorch_context_binary_;
333
432
std::shared_ptr<QnnManager> qnn_manager_;
433
+ flatbuffers::FlatBufferBuilder64 builder64_;
334
434
flatbuffers::FlatBufferBuilder builder_;
435
+ flatbuffers::Verifier::Options fb_opt_;
335
436
};
336
437
} // namespace qnn
337
438
} // namespace backends
0 commit comments