Skip to content

Commit b655ce1

Browse files
committed
Qualcomm AI Engine Direct - enable loading context binary directly
Summary: - add utilities for loading context binary generated from qnn tools - align env variable naming with qnn - fix bug in online prepare and extend coverage to support bitwise quantization - llama7b e2e example from qualcomm ai_hub - minor fixes for style & typo
1 parent 5584b9e commit b655ce1

30 files changed

+2103
-164
lines changed

backends/qualcomm/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,9 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
235235
set_target_properties(
236236
PyQnnManagerAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
237237
)
238+
set_target_properties(
239+
PyQnnWrapperAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
240+
)
238241

239242
target_link_libraries(
240243
PyQnnManagerAdaptor

backends/qualcomm/aot/ir/qcir.fbs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,8 @@ enum QuantizeDef : byte {
5252
enum QuantizeType : byte {
5353
SCALE_OFFSET = 0,
5454
AXIS_SCALE_OFFSET,
55-
// TODO: enable
56-
// QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET
57-
// QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET
55+
BW_SCALE_OFFSET,
56+
BW_AXIS_SCALE_OFFSET,
5857
UNDEFINED,
5958
}
6059

@@ -66,7 +65,12 @@ struct ScaleOffset {
6665
table QuantizeParam {
6766
def: QuantizeDef;
6867
type: QuantizeType;
68+
bitwidth: int;
6969
axis: int;
70+
// used by bitwidth quantization
71+
scales: [float];
72+
offsets: [int];
73+
// used by general quantization
7074
data: [ScaleOffset];
7175
}
7276

backends/qualcomm/aot/ir/qcir_utils.cpp

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,11 @@ qcir::DataType ToDataType(Qnn_DataType_t type) {
5555
{QNN_DATATYPE_FLOAT_16, qcir::DataType::FLOAT16},
5656
{QNN_DATATYPE_FLOAT_32, qcir::DataType::FLOAT32},
5757
// {QNN_DATATYPE_FLOAT_64, qcir::DataType::FLOAT64},
58-
// {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
58+
{QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
5959
{QNN_DATATYPE_SFIXED_POINT_8, qcir::DataType::SFIXED8},
6060
{QNN_DATATYPE_SFIXED_POINT_16, qcir::DataType::SFIXED16},
6161
{QNN_DATATYPE_SFIXED_POINT_32, qcir::DataType::SFIXED32},
62-
// {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
62+
{QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
6363
{QNN_DATATYPE_UFIXED_POINT_8, qcir::DataType::UFIXED8},
6464
{QNN_DATATYPE_UFIXED_POINT_16, qcir::DataType::UFIXED16},
6565
{QNN_DATATYPE_UFIXED_POINT_32, qcir::DataType::UFIXED32},
@@ -84,11 +84,11 @@ Qnn_DataType_t ToDataType(qcir::DataType type) {
8484
{qcir::DataType::FLOAT16, QNN_DATATYPE_FLOAT_16},
8585
{qcir::DataType::FLOAT32, QNN_DATATYPE_FLOAT_32},
8686
// {qcir::DataType::FLOAT64, QNN_DATATYPE_FLOAT_64},
87-
// {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
87+
{qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
8888
{qcir::DataType::SFIXED8, QNN_DATATYPE_SFIXED_POINT_8},
8989
{qcir::DataType::SFIXED16, QNN_DATATYPE_SFIXED_POINT_16},
9090
{qcir::DataType::SFIXED32, QNN_DATATYPE_SFIXED_POINT_32},
91-
// {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
91+
{qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
9292
{qcir::DataType::UFIXED8, QNN_DATATYPE_UFIXED_POINT_8},
9393
{qcir::DataType::UFIXED16, QNN_DATATYPE_UFIXED_POINT_16},
9494
{qcir::DataType::UFIXED32, QNN_DATATYPE_UFIXED_POINT_32},
@@ -114,13 +114,19 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
114114
qcir::QuantizeType::SCALE_OFFSET},
115115
{QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET,
116116
qcir::QuantizeType::AXIS_SCALE_OFFSET},
117+
{QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET,
118+
qcir::QuantizeType::BW_SCALE_OFFSET},
119+
{QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET,
120+
qcir::QuantizeType::BW_AXIS_SCALE_OFFSET},
117121
{QNN_QUANTIZATION_ENCODING_UNDEFINED,
118122
qcir::QuantizeType::UNDEFINED},
119123
};
120124

121-
int axis = 0;
125+
int axis = 0, bitwidth = 0;
122126
auto quant_type = type_map.at(param.quantizationEncoding);
123127
std::vector<qcir::ScaleOffset> data;
128+
std::vector<float> scales;
129+
std::vector<int32_t> offsets;
124130
switch (quant_type) {
125131
case qcir::QuantizeType::SCALE_OFFSET: {
126132
data.emplace_back(qcir::ScaleOffset(
@@ -135,11 +141,32 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
135141
param.axisScaleOffsetEncoding.scaleOffset[i].offset));
136142
}
137143
} break;
144+
case qcir::QuantizeType::BW_SCALE_OFFSET: {
145+
bitwidth = param.bwScaleOffsetEncoding.bitwidth;
146+
scales.push_back(param.bwScaleOffsetEncoding.scale);
147+
offsets.push_back(param.bwScaleOffsetEncoding.offset);
148+
} break;
149+
case qcir::QuantizeType::BW_AXIS_SCALE_OFFSET: {
150+
bitwidth = param.bwAxisScaleOffsetEncoding.bitwidth;
151+
axis = param.bwAxisScaleOffsetEncoding.axis;
152+
size_t len = param.bwAxisScaleOffsetEncoding.numElements;
153+
for (uint i = 0; i < len; ++i) {
154+
scales.push_back(param.bwAxisScaleOffsetEncoding.scales[i]);
155+
offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]);
156+
}
157+
} break;
138158
default:
139159
break;
140160
}
141161
return CreateQuantizeParamDirect(
142-
*builder, def_map.at(param.encodingDefinition), quant_type, axis, &data);
162+
*builder,
163+
def_map.at(param.encodingDefinition),
164+
quant_type,
165+
bitwidth,
166+
axis,
167+
&scales,
168+
&offsets,
169+
&data);
143170
}
144171

145172
Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
@@ -155,6 +182,10 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
155182
QNN_QUANTIZATION_ENCODING_SCALE_OFFSET},
156183
{qcir::QuantizeType::AXIS_SCALE_OFFSET,
157184
QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET},
185+
{qcir::QuantizeType::BW_SCALE_OFFSET,
186+
QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET},
187+
{qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
188+
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET},
158189
{qcir::QuantizeType::UNDEFINED,
159190
QNN_QUANTIZATION_ENCODING_UNDEFINED},
160191
};
@@ -174,6 +205,20 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
174205
reinterpret_cast<Qnn_ScaleOffset_t*>(
175206
const_cast<uint8_t*>(param->data()->Data()));
176207
} break;
208+
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
209+
p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
210+
p.bwScaleOffsetEncoding.scale = param->scales()->Get(0);
211+
p.bwScaleOffsetEncoding.offset = param->offsets()->Get(0);
212+
} break;
213+
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
214+
p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
215+
p.bwAxisScaleOffsetEncoding.axis = param->axis();
216+
p.bwAxisScaleOffsetEncoding.numElements = param->scales()->size();
217+
p.bwAxisScaleOffsetEncoding.scales = reinterpret_cast<float*>(
218+
const_cast<uint8_t*>(param->scales()->Data()));
219+
p.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast<int32_t*>(
220+
const_cast<uint8_t*>(param->offsets()->Data()));
221+
} break;
177222
default:
178223
break;
179224
}

backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,16 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
2626

2727
py::class_<PyQnnManager, std::shared_ptr<PyQnnManager>>(m, "QnnManager")
2828
.def(py::init<const py::bytes&>())
29+
.def(py::init<const py::bytes&, const py::bytes&>())
2930
.def("Init", &PyQnnManager::Init)
3031
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
3132
.def("Compile", &PyQnnManager::Compile)
3233
.def("Destroy", &PyQnnManager::Destroy)
3334
.def("IsAvailable", &PyQnnManager::IsAvailable)
34-
.def("IsTensorDump", &PyQnnManager::IsTensorDump);
35+
.def("IsTensorDump", &PyQnnManager::IsTensorDump)
36+
.def("AllocateTensor", &PyQnnManager::AllocateTensor)
37+
.def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
38+
.def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
3539
}
3640
} // namespace qnn
3741
} // namespace executor

backends/qualcomm/aot/python/PyQnnManagerAdaptor.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88
#pragma once
99
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
10+
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
1011
#include <executorch/backends/qualcomm/runtime/Logging.h>
1112
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
1213
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
@@ -23,6 +24,7 @@ namespace executor {
2324
namespace qnn {
2425
class PyQnnManager {
2526
public:
27+
// used for AoT compilation
2628
explicit PyQnnManager(const py::bytes& buffer)
2729
: qnn_executorch_option_ptr_(buffer),
2830
qnn_executorch_context_binary_(QNN_EXECUTORCH_CONTEXT_BINARY) {
@@ -33,6 +35,18 @@ class PyQnnManager {
3335
qnn_manager_ = std::make_shared<QnnManager>(
3436
qnn_executorch_options, qnn_executorch_context_binary_);
3537
}
38+
// used for loading context binary directly
39+
explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
40+
: qnn_executorch_option_ptr_(buffer) {
41+
auto qnn_executorch_options = GetQnnExecuTorchOptions(
42+
qnn_executorch_option_ptr_.cast<std::string_view>().data());
43+
44+
py::buffer_info info(py::buffer(ctx_bin).request());
45+
qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
46+
qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
47+
qnn_manager_ = std::make_shared<QnnManager>(
48+
qnn_executorch_options, qnn_executorch_context_binary_);
49+
}
3650

3751
Error Init() {
3852
return qnn_manager_->Init();
@@ -141,6 +155,28 @@ class PyQnnManager {
141155
return qnn_manager_->IsTensorDump();
142156
}
143157

158+
Error AllocateTensor() {
159+
return qnn_manager_->AllocateTensor();
160+
}
161+
162+
py::list GetGraphInputs() {
163+
py::list ret;
164+
for (std::shared_ptr<TensorWrapper>& input :
165+
qnn_manager_->GetGraphInputs()) {
166+
ret.append(PyQnnTensorWrapper(input));
167+
}
168+
return ret;
169+
}
170+
171+
py::list GetGraphOutputs() {
172+
py::list ret;
173+
for (std::shared_ptr<TensorWrapper>& output :
174+
qnn_manager_->GetGraphOutputs()) {
175+
ret.append(PyQnnTensorWrapper(output));
176+
}
177+
return ret;
178+
}
179+
144180
private:
145181
// Store the bytes object instead of a raw pointer so that this module will
146182
// keep the bytes alive.

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
104104
}
105105

106106
PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
107+
PYBIND11_NUMPY_DTYPE(PyQnnTensorWrapper::EncodingData, scale, offset);
108+
107109
py::enum_<Qnn_TensorType_t>(m, "Qnn_TensorType_t")
108110
.value(
109111
"QNN_TENSOR_TYPE_APP_WRITE",
@@ -234,6 +236,18 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
234236
"GetOpWrapper",
235237
&PyQnnOpWrapper::GetOpWrapper,
236238
"A function which get op wrapper");
239+
240+
py::class_<PyQnnTensorWrapper::Encoding>(m, "Encoding")
241+
.def_readonly("data", &PyQnnTensorWrapper::Encoding::data)
242+
.def_readonly("axis", &PyQnnTensorWrapper::Encoding::axis);
243+
244+
py::class_<PyQnnTensorWrapper, std::shared_ptr<PyQnnTensorWrapper>>(
245+
m, "PyQnnTensorWrapper")
246+
.def(py::init<const std::shared_ptr<TensorWrapper>&>())
247+
.def("GetDims", &PyQnnTensorWrapper::GetDims)
248+
.def("GetDataType", &PyQnnTensorWrapper::GetDataType)
249+
.def("GetName", &PyQnnTensorWrapper::GetName)
250+
.def("GetEncodings", &PyQnnTensorWrapper::GetEncodings);
237251
}
238252
} // namespace qnn
239253
} // namespace executor

backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,88 @@ class PyQnnOpWrapper {
9696
private:
9797
std::shared_ptr<OpWrapper> op_wrapper_;
9898
};
99+
100+
class PyQnnTensorWrapper {
101+
public:
102+
explicit PyQnnTensorWrapper(const std::shared_ptr<TensorWrapper>& wrapper) {
103+
tensor_wrapper_ = wrapper;
104+
}
105+
struct EncodingData {
106+
float scale;
107+
int32_t offset;
108+
};
109+
struct Encoding {
110+
py::array_t<EncodingData> data;
111+
int32_t axis;
112+
};
113+
114+
py::array_t<std::uint32_t> GetDims() {
115+
std::uint32_t* dim = tensor_wrapper_->GetDims();
116+
size_t shape[1]{tensor_wrapper_->GetRank()};
117+
size_t stride[1]{sizeof(std::uint32_t)};
118+
auto ret = py::array_t<std::uint32_t>(shape, stride);
119+
auto view = ret.mutable_unchecked<1>();
120+
for (int i = 0; i < ret.shape(0); ++i) {
121+
view(i) = dim[i];
122+
}
123+
return ret;
124+
}
125+
std::string GetName() {
126+
return tensor_wrapper_->GetName();
127+
}
128+
Qnn_DataType_t GetDataType() {
129+
return tensor_wrapper_->GetDataType();
130+
}
131+
Encoding GetEncodings() {
132+
auto q_param = tensor_wrapper_->GetQuantizeParams();
133+
size_t stride[1]{sizeof(EncodingData)};
134+
135+
switch (q_param.quantizationEncoding) {
136+
case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: {
137+
Qnn_ScaleOffset_t data = q_param.scaleOffsetEncoding;
138+
size_t shape[1]{1};
139+
auto enc_data = py::array_t<EncodingData>(shape, stride);
140+
auto view = enc_data.mutable_unchecked<1>();
141+
view(0) = {data.scale, data.offset};
142+
return {enc_data, -1};
143+
}
144+
case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
145+
Qnn_AxisScaleOffset_t data = q_param.axisScaleOffsetEncoding;
146+
size_t shape[1]{data.numScaleOffsets};
147+
auto enc_data = py::array_t<EncodingData>(shape, stride);
148+
auto view = enc_data.mutable_unchecked<1>();
149+
for (int i = 0; i < enc_data.shape(0); ++i) {
150+
view(i) = {data.scaleOffset[i].scale, data.scaleOffset[i].offset};
151+
}
152+
return {enc_data, data.axis};
153+
}
154+
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
155+
Qnn_BwScaleOffset_t data = q_param.bwScaleOffsetEncoding;
156+
size_t shape[1]{1};
157+
auto enc_data = py::array_t<EncodingData>(shape, stride);
158+
auto view = enc_data.mutable_unchecked<1>();
159+
view(0) = {data.scale, data.offset};
160+
return {enc_data, -1};
161+
}
162+
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
163+
Qnn_BwAxisScaleOffset_t data = q_param.bwAxisScaleOffsetEncoding;
164+
size_t shape[1]{data.numElements};
165+
auto enc_data = py::array_t<EncodingData>(shape, stride);
166+
auto view = enc_data.mutable_unchecked<1>();
167+
for (int i = 0; i < enc_data.shape(0); ++i) {
168+
view(i) = {data.scales[i], data.offsets[i]};
169+
}
170+
return {enc_data, data.axis};
171+
}
172+
default:
173+
break;
174+
}
175+
return {};
176+
}
177+
178+
private:
179+
std::shared_ptr<TensorWrapper> tensor_wrapper_;
180+
};
99181
} // namespace qnn
100182
} // namespace executor
101183
} // namespace torch

backends/qualcomm/aot/wrappers/TensorWrapper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ class TensorWrapper {
7575
return QNN_VER_PTR(tensor_)->memType;
7676
};
7777

78+
Qnn_QuantizeParams_t GetQuantizeParams() const {
79+
return QNN_VER_PTR(tensor_)->quantizeParams;
80+
}
81+
7882
std::string GetName() const {
7983
return qnn_tensor_name_;
8084
};

backends/qualcomm/builders/qnn_constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ class OpConcat:
3030
param_axis: str = "axis"
3131

3232

33+
@dataclass(init=False, frozen=True)
class OpContextLoader:
    """Constants used when lowering a pre-built QNN context binary."""

    # Custom-op namespace used by the Qualcomm AI Engine Direct backend.
    namespace: str = "qaisw"
    # Metadata key under which the raw context binary is stored.
    meta_ctx_bin: str = "qnn_context_binary"
37+
38+
3339
@dataclass(init=False, frozen=True)
3440
class OpConv2d:
3541
op_name: str = "Conv2d"

0 commit comments

Comments
 (0)