Skip to content

Commit 1997683

Browse files
haowhsu-quic
authored and committed
Qualcomm AI Engine Direct - enable loading context binary directly
Summary: - add utilities for loading context binary generated from qnn tools - align env variable naming with qnn - fix bug in online prepare and extend coverage to support bitwise quantization - llama7b e2e example from qualcomm ai_hub - minor fixes for style & typo
1 parent 740a0a5 commit 1997683

29 files changed

+2089
-160
lines changed

backends/qualcomm/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,9 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
235235
set_target_properties(
236236
PyQnnManagerAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
237237
)
238+
set_target_properties(
239+
PyQnnWrapperAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
240+
)
238241

239242
target_link_libraries(
240243
PyQnnManagerAdaptor

backends/qualcomm/aot/ir/qcir.fbs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,8 @@ enum QuantizeDef : byte {
5252
enum QuantizeType : byte {
5353
SCALE_OFFSET = 0,
5454
AXIS_SCALE_OFFSET,
55-
// TODO: enable
56-
// QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET
57-
// QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
55+
BW_SCALE_OFFSET,
56+
BW_AXIS_SCALE_OFFSET,
5857
UNDEFINED,
5958
}
6059

@@ -66,7 +65,12 @@ struct ScaleOffset {
6665
table QuantizeParam {
6766
def: QuantizeDef;
6867
type: QuantizeType;
68+
bitwidth: uint;
6969
axis: int;
70+
// used by bitwidth quantization
71+
scales: [float];
72+
offsets: [int];
73+
// used by general quantization
7074
data: [ScaleOffset];
7175
}
7276

backends/qualcomm/aot/ir/qcir_utils.cpp

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,11 @@ qcir::DataType ToDataType(Qnn_DataType_t type) {
5555
{QNN_DATATYPE_FLOAT_16, qcir::DataType::FLOAT16},
5656
{QNN_DATATYPE_FLOAT_32, qcir::DataType::FLOAT32},
5757
// {QNN_DATATYPE_FLOAT_64, qcir::DataType::FLOAT64},
58-
// {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
58+
{QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
5959
{QNN_DATATYPE_SFIXED_POINT_8, qcir::DataType::SFIXED8},
6060
{QNN_DATATYPE_SFIXED_POINT_16, qcir::DataType::SFIXED16},
6161
{QNN_DATATYPE_SFIXED_POINT_32, qcir::DataType::SFIXED32},
62-
// {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
62+
{QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
6363
{QNN_DATATYPE_UFIXED_POINT_8, qcir::DataType::UFIXED8},
6464
{QNN_DATATYPE_UFIXED_POINT_16, qcir::DataType::UFIXED16},
6565
{QNN_DATATYPE_UFIXED_POINT_32, qcir::DataType::UFIXED32},
@@ -84,11 +84,11 @@ Qnn_DataType_t ToDataType(qcir::DataType type) {
8484
{qcir::DataType::FLOAT16, QNN_DATATYPE_FLOAT_16},
8585
{qcir::DataType::FLOAT32, QNN_DATATYPE_FLOAT_32},
8686
// {qcir::DataType::FLOAT64, QNN_DATATYPE_FLOAT_64},
87-
// {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
87+
{qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
8888
{qcir::DataType::SFIXED8, QNN_DATATYPE_SFIXED_POINT_8},
8989
{qcir::DataType::SFIXED16, QNN_DATATYPE_SFIXED_POINT_16},
9090
{qcir::DataType::SFIXED32, QNN_DATATYPE_SFIXED_POINT_32},
91-
// {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
91+
{qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
9292
{qcir::DataType::UFIXED8, QNN_DATATYPE_UFIXED_POINT_8},
9393
{qcir::DataType::UFIXED16, QNN_DATATYPE_UFIXED_POINT_16},
9494
{qcir::DataType::UFIXED32, QNN_DATATYPE_UFIXED_POINT_32},
@@ -114,13 +114,20 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
114114
qcir::QuantizeType::SCALE_OFFSET},
115115
{QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET,
116116
qcir::QuantizeType::AXIS_SCALE_OFFSET},
117+
{QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET,
118+
qcir::QuantizeType::BW_SCALE_OFFSET},
119+
{QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET,
120+
qcir::QuantizeType::BW_AXIS_SCALE_OFFSET},
117121
{QNN_QUANTIZATION_ENCODING_UNDEFINED,
118122
qcir::QuantizeType::UNDEFINED},
119123
};
120124

121-
int axis = 0;
125+
int32_t axis = 0;
126+
uint32_t bitwidth = 0;
122127
auto quant_type = type_map.at(param.quantizationEncoding);
123128
std::vector<qcir::ScaleOffset> data;
129+
std::vector<float> scales;
130+
std::vector<int32_t> offsets;
124131
switch (quant_type) {
125132
case qcir::QuantizeType::SCALE_OFFSET: {
126133
data.emplace_back(qcir::ScaleOffset(
@@ -135,11 +142,33 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
135142
param.axisScaleOffsetEncoding.scaleOffset[i].offset));
136143
}
137144
} break;
145+
case qcir::QuantizeType::BW_SCALE_OFFSET: {
146+
bitwidth = param.bwScaleOffsetEncoding.bitwidth;
147+
scales.push_back(param.bwScaleOffsetEncoding.scale);
148+
offsets.push_back(param.bwScaleOffsetEncoding.offset);
149+
} break;
150+
case qcir::QuantizeType::BW_AXIS_SCALE_OFFSET: {
151+
bitwidth = param.bwAxisScaleOffsetEncoding.bitwidth;
152+
axis = param.bwAxisScaleOffsetEncoding.axis;
153+
size_t len = param.bwAxisScaleOffsetEncoding.numElements;
154+
for (size_t i = 0; i < len; ++i) {
155+
scales.push_back(param.bwAxisScaleOffsetEncoding.scales[i]);
156+
offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]);
157+
}
158+
} break;
138159
default:
160+
QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected");
139161
break;
140162
}
141163
return CreateQuantizeParamDirect(
142-
*builder, def_map.at(param.encodingDefinition), quant_type, axis, &data);
164+
*builder,
165+
def_map.at(param.encodingDefinition),
166+
quant_type,
167+
bitwidth,
168+
axis,
169+
&scales,
170+
&offsets,
171+
&data);
143172
}
144173

145174
Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
@@ -155,6 +184,10 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
155184
QNN_QUANTIZATION_ENCODING_SCALE_OFFSET},
156185
{qcir::QuantizeType::AXIS_SCALE_OFFSET,
157186
QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET},
187+
{qcir::QuantizeType::BW_SCALE_OFFSET,
188+
QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET},
189+
{qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
190+
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET},
158191
{qcir::QuantizeType::UNDEFINED,
159192
QNN_QUANTIZATION_ENCODING_UNDEFINED},
160193
};
@@ -174,7 +207,22 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
174207
reinterpret_cast<Qnn_ScaleOffset_t*>(
175208
const_cast<uint8_t*>(param->data()->Data()));
176209
} break;
210+
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
211+
p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
212+
p.bwScaleOffsetEncoding.scale = param->scales()->Get(0);
213+
p.bwScaleOffsetEncoding.offset = param->offsets()->Get(0);
214+
} break;
215+
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
216+
p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
217+
p.bwAxisScaleOffsetEncoding.axis = param->axis();
218+
p.bwAxisScaleOffsetEncoding.numElements = param->scales()->size();
219+
p.bwAxisScaleOffsetEncoding.scales = reinterpret_cast<float*>(
220+
const_cast<uint8_t*>(param->scales()->Data()));
221+
p.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast<int32_t*>(
222+
const_cast<uint8_t*>(param->offsets()->Data()));
223+
} break;
177224
default:
225+
QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected");
178226
break;
179227
}
180228
return p;

backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,16 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
2626

2727
py::class_<PyQnnManager, std::shared_ptr<PyQnnManager>>(m, "QnnManager")
2828
.def(py::init<const py::bytes&>())
29+
.def(py::init<const py::bytes&, const py::bytes&>())
2930
.def("Init", &PyQnnManager::Init)
3031
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
3132
.def("Compile", &PyQnnManager::Compile)
3233
.def("Destroy", &PyQnnManager::Destroy)
3334
.def("IsAvailable", &PyQnnManager::IsAvailable)
34-
.def("IsTensorDump", &PyQnnManager::IsTensorDump);
35+
.def("IsTensorDump", &PyQnnManager::IsTensorDump)
36+
.def("AllocateTensor", &PyQnnManager::AllocateTensor)
37+
.def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
38+
.def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
3539
}
3640
} // namespace qnn
3741
} // namespace executor

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88
#pragma once
99
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
10+
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
1011
#include <executorch/backends/qualcomm/runtime/Logging.h>
1112
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
1213
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
@@ -23,6 +24,7 @@ namespace executor {
2324
namespace qnn {
2425
class PyQnnManager {
2526
public:
27+
// used for AoT compilation
2628
explicit PyQnnManager(const py::bytes& buffer)
2729
: qnn_executorch_option_ptr_(buffer),
2830
qnn_executorch_context_binary_(QNN_EXECUTORCH_CONTEXT_BINARY) {
@@ -33,6 +35,18 @@ class PyQnnManager {
3335
qnn_manager_ = std::make_shared<QnnManager>(
3436
qnn_executorch_options, qnn_executorch_context_binary_);
3537
}
38+
// used for loading context binary directly
39+
explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
40+
: qnn_executorch_option_ptr_(buffer) {
41+
auto qnn_executorch_options = GetQnnExecuTorchOptions(
42+
qnn_executorch_option_ptr_.cast<std::string_view>().data());
43+
44+
py::buffer_info info(py::buffer(ctx_bin).request());
45+
qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
46+
qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
47+
qnn_manager_ = std::make_shared<QnnManager>(
48+
qnn_executorch_options, qnn_executorch_context_binary_);
49+
}
3650

3751
Error Init() {
3852
return qnn_manager_->Init();
@@ -141,6 +155,28 @@ class PyQnnManager {
141155
return qnn_manager_->IsTensorDump();
142156
}
143157

158+
Error AllocateTensor() {
159+
return qnn_manager_->AllocateTensor();
160+
}
161+
162+
py::list GetGraphInputs() {
163+
py::list ret;
164+
for (const std::shared_ptr<TensorWrapper>& input :
165+
qnn_manager_->GetGraphInputs()) {
166+
ret.append(PyQnnTensorWrapper(input));
167+
}
168+
return ret;
169+
}
170+
171+
py::list GetGraphOutputs() {
172+
py::list ret;
173+
for (const std::shared_ptr<TensorWrapper>& output :
174+
qnn_manager_->GetGraphOutputs()) {
175+
ret.append(PyQnnTensorWrapper(output));
176+
}
177+
return ret;
178+
}
179+
144180
private:
145181
// Store the bytes object instead of a raw pointer so that this module will
146182
// keep the bytes alive.

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
104104
}
105105

106106
PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
107+
PYBIND11_NUMPY_DTYPE(PyQnnTensorWrapper::EncodingData, scale, offset);
108+
107109
py::enum_<Qnn_TensorType_t>(m, "Qnn_TensorType_t")
108110
.value(
109111
"QNN_TENSOR_TYPE_APP_WRITE",
@@ -234,6 +236,18 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
234236
"GetOpWrapper",
235237
&PyQnnOpWrapper::GetOpWrapper,
236238
"A function which get op wrapper");
239+
240+
py::class_<PyQnnTensorWrapper::Encoding>(m, "Encoding")
241+
.def_readonly("data", &PyQnnTensorWrapper::Encoding::data)
242+
.def_readonly("axis", &PyQnnTensorWrapper::Encoding::axis);
243+
244+
py::class_<PyQnnTensorWrapper, std::shared_ptr<PyQnnTensorWrapper>>(
245+
m, "PyQnnTensorWrapper")
246+
.def(py::init<const std::shared_ptr<TensorWrapper>&>())
247+
.def("GetDims", &PyQnnTensorWrapper::GetDims)
248+
.def("GetDataType", &PyQnnTensorWrapper::GetDataType)
249+
.def("GetName", &PyQnnTensorWrapper::GetName)
250+
.def("GetEncodings", &PyQnnTensorWrapper::GetEncodings);
237251
}
238252
} // namespace qnn
239253
} // namespace executor

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ class PyQnnOpWrapper {
8585
name, data_type, attrData["data"].cast<bool>());
8686
break;
8787
default:
88-
QNN_EXECUTORCH_LOG_ERROR("tensor.v1.name: %d", data_type);
88+
QNN_EXECUTORCH_LOG_ERROR(
89+
"%s has invalid data type: %d", name, data_type);
8990
break;
9091
}
9192
}
@@ -96,6 +97,91 @@ class PyQnnOpWrapper {
9697
private:
9798
std::shared_ptr<OpWrapper> op_wrapper_;
9899
};
100+
101+
class PyQnnTensorWrapper {
102+
public:
103+
explicit PyQnnTensorWrapper(const std::shared_ptr<TensorWrapper>& wrapper) {
104+
tensor_wrapper_ = wrapper;
105+
}
106+
struct EncodingData {
107+
float scale;
108+
int32_t offset;
109+
};
110+
struct Encoding {
111+
py::array_t<EncodingData> data;
112+
int32_t axis;
113+
};
114+
115+
py::array_t<std::uint32_t> GetDims() {
116+
std::uint32_t* dim = tensor_wrapper_->GetDims();
117+
size_t shape[1]{tensor_wrapper_->GetRank()};
118+
size_t stride[1]{sizeof(std::uint32_t)};
119+
auto ret = py::array_t<std::uint32_t>(shape, stride);
120+
auto view = ret.mutable_unchecked<1>();
121+
for (int i = 0; i < ret.shape(0); ++i) {
122+
view(i) = dim[i];
123+
}
124+
return ret;
125+
}
126+
std::string GetName() {
127+
return tensor_wrapper_->GetName();
128+
}
129+
Qnn_DataType_t GetDataType() {
130+
return tensor_wrapper_->GetDataType();
131+
}
132+
Encoding GetEncodings() {
133+
auto q_param = tensor_wrapper_->GetQuantizeParams();
134+
size_t stride[1]{sizeof(EncodingData)};
135+
136+
switch (q_param.quantizationEncoding) {
137+
case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: {
138+
Qnn_ScaleOffset_t data = q_param.scaleOffsetEncoding;
139+
size_t shape[1]{1};
140+
auto enc_data = py::array_t<EncodingData>(shape, stride);
141+
auto view = enc_data.mutable_unchecked<1>();
142+
view(0) = {data.scale, data.offset};
143+
return {enc_data, -1};
144+
}
145+
case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
146+
Qnn_AxisScaleOffset_t data = q_param.axisScaleOffsetEncoding;
147+
size_t shape[1]{data.numScaleOffsets};
148+
auto enc_data = py::array_t<EncodingData>(shape, stride);
149+
auto view = enc_data.mutable_unchecked<1>();
150+
for (int i = 0; i < enc_data.shape(0); ++i) {
151+
view(i) = {data.scaleOffset[i].scale, data.scaleOffset[i].offset};
152+
}
153+
return {enc_data, data.axis};
154+
}
155+
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
156+
Qnn_BwScaleOffset_t data = q_param.bwScaleOffsetEncoding;
157+
size_t shape[1]{1};
158+
auto enc_data = py::array_t<EncodingData>(shape, stride);
159+
auto view = enc_data.mutable_unchecked<1>();
160+
view(0) = {data.scale, data.offset};
161+
return {enc_data, -1};
162+
}
163+
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
164+
Qnn_BwAxisScaleOffset_t data = q_param.bwAxisScaleOffsetEncoding;
165+
size_t shape[1]{data.numElements};
166+
auto enc_data = py::array_t<EncodingData>(shape, stride);
167+
auto view = enc_data.mutable_unchecked<1>();
168+
for (int i = 0; i < enc_data.shape(0); ++i) {
169+
view(i) = {data.scales[i], data.offsets[i]};
170+
}
171+
return {enc_data, data.axis};
172+
}
173+
default:
174+
QNN_EXECUTORCH_LOG_ERROR(
175+
"%s QNN_QUANTIZATION_ENCODING_UNDEFINED detected",
176+
GetName().c_str());
177+
break;
178+
}
179+
return {};
180+
}
181+
182+
private:
183+
std::shared_ptr<TensorWrapper> tensor_wrapper_;
184+
};
99185
} // namespace qnn
100186
} // namespace executor
101187
} // namespace torch

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ class TensorWrapper {
7575
return QNN_VER_PTR(tensor_)->memType;
7676
};
7777

78+
Qnn_QuantizeParams_t GetQuantizeParams() const {
79+
return QNN_VER_PTR(tensor_)->quantizeParams;
80+
}
81+
7882
std::string GetName() const {
7983
return qnn_tensor_name_;
8084
};

backends/qualcomm/builders/qnn_constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ class OpConcat:
3030
param_axis: str = "axis"
3131

3232

33+
@dataclass(init=False, frozen=True)
34+
class OpContextLoader:
35+
namespace: str = "qaisw"
36+
meta_ctx_bin: str = "qnn_context_binary"
37+
38+
3339
@dataclass(init=False, frozen=True)
3440
class OpConv2d:
3541
op_name: str = "Conv2d"

0 commit comments

Comments (0)