Skip to content

Qualcomm AI Engine Direct - enable loading context binary directly #4163

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions backends/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
set_target_properties(
PyQnnManagerAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
)
# Hide non-exported symbols of the PyQnnWrapperAdaptor Python extension,
# mirroring the visibility setting applied to PyQnnManagerAdaptor above.
set_target_properties(
  PyQnnWrapperAdaptor PROPERTIES CXX_VISIBILITY_PRESET hidden
)

target_link_libraries(
PyQnnManagerAdaptor
Expand Down
10 changes: 7 additions & 3 deletions backends/qualcomm/aot/ir/qcir.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,8 @@ enum QuantizeDef : byte {
enum QuantizeType : byte {
  SCALE_OFFSET = 0,
  AXIS_SCALE_OFFSET,
  // bitwidth-based encodings, mirroring the QNN
  // QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET /
  // QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET variants
  BW_SCALE_OFFSET,
  BW_AXIS_SCALE_OFFSET,
  UNDEFINED,
}

Expand All @@ -66,7 +65,12 @@ struct ScaleOffset {
// Quantization description attached to one tensor.
table QuantizeParam {
  def: QuantizeDef;
  type: QuantizeType;
  // used only by the bitwidth encodings (BW_SCALE_OFFSET / BW_AXIS_SCALE_OFFSET)
  bitwidth: uint;
  // quantization axis for the per-axis encodings
  axis: int;
  // used by bitwidth quantization
  scales: [float];
  offsets: [int];
  // used by general quantization
  data: [ScaleOffset];
}

Expand Down
66 changes: 58 additions & 8 deletions backends/qualcomm/aot/ir/qcir_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ qcir::DataType ToDataType(Qnn_DataType_t type) {
{QNN_DATATYPE_FLOAT_16, qcir::DataType::FLOAT16},
{QNN_DATATYPE_FLOAT_32, qcir::DataType::FLOAT32},
// {QNN_DATATYPE_FLOAT_64, qcir::DataType::FLOAT64},
// {QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
{QNN_DATATYPE_SFIXED_POINT_4, qcir::DataType::SFIXED4},
{QNN_DATATYPE_SFIXED_POINT_8, qcir::DataType::SFIXED8},
{QNN_DATATYPE_SFIXED_POINT_16, qcir::DataType::SFIXED16},
{QNN_DATATYPE_SFIXED_POINT_32, qcir::DataType::SFIXED32},
// {QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
{QNN_DATATYPE_UFIXED_POINT_4, qcir::DataType::UFIXED4},
{QNN_DATATYPE_UFIXED_POINT_8, qcir::DataType::UFIXED8},
{QNN_DATATYPE_UFIXED_POINT_16, qcir::DataType::UFIXED16},
{QNN_DATATYPE_UFIXED_POINT_32, qcir::DataType::UFIXED32},
Expand All @@ -84,11 +84,11 @@ Qnn_DataType_t ToDataType(qcir::DataType type) {
{qcir::DataType::FLOAT16, QNN_DATATYPE_FLOAT_16},
{qcir::DataType::FLOAT32, QNN_DATATYPE_FLOAT_32},
// {qcir::DataType::FLOAT64, QNN_DATATYPE_FLOAT_64},
// {qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
{qcir::DataType::SFIXED4, QNN_DATATYPE_SFIXED_POINT_4},
{qcir::DataType::SFIXED8, QNN_DATATYPE_SFIXED_POINT_8},
{qcir::DataType::SFIXED16, QNN_DATATYPE_SFIXED_POINT_16},
{qcir::DataType::SFIXED32, QNN_DATATYPE_SFIXED_POINT_32},
// {qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
{qcir::DataType::UFIXED4, QNN_DATATYPE_UFIXED_POINT_4},
{qcir::DataType::UFIXED8, QNN_DATATYPE_UFIXED_POINT_8},
{qcir::DataType::UFIXED16, QNN_DATATYPE_UFIXED_POINT_16},
{qcir::DataType::UFIXED32, QNN_DATATYPE_UFIXED_POINT_32},
Expand All @@ -114,13 +114,20 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
qcir::QuantizeType::SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET,
qcir::QuantizeType::AXIS_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET,
qcir::QuantizeType::BW_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET,
qcir::QuantizeType::BW_AXIS_SCALE_OFFSET},
{QNN_QUANTIZATION_ENCODING_UNDEFINED,
qcir::QuantizeType::UNDEFINED},
};

int axis = 0;
int32_t axis = 0;
uint32_t bitwidth = 0;
auto quant_type = type_map.at(param.quantizationEncoding);
std::vector<qcir::ScaleOffset> data;
std::vector<float> scales;
std::vector<int32_t> offsets;
switch (quant_type) {
case qcir::QuantizeType::SCALE_OFFSET: {
data.emplace_back(qcir::ScaleOffset(
Expand All @@ -129,17 +136,42 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
case qcir::QuantizeType::AXIS_SCALE_OFFSET: {
size_t len = param.axisScaleOffsetEncoding.numScaleOffsets;
axis = param.axisScaleOffsetEncoding.axis;
data.reserve(len);
for (uint i = 0; i < len; ++i) {
data.emplace_back(qcir::ScaleOffset(
param.axisScaleOffsetEncoding.scaleOffset[i].scale,
param.axisScaleOffsetEncoding.scaleOffset[i].offset));
}
} break;
case qcir::QuantizeType::BW_SCALE_OFFSET: {
bitwidth = param.bwScaleOffsetEncoding.bitwidth;
scales.push_back(param.bwScaleOffsetEncoding.scale);
offsets.push_back(param.bwScaleOffsetEncoding.offset);
} break;
case qcir::QuantizeType::BW_AXIS_SCALE_OFFSET: {
bitwidth = param.bwAxisScaleOffsetEncoding.bitwidth;
axis = param.bwAxisScaleOffsetEncoding.axis;
size_t len = param.bwAxisScaleOffsetEncoding.numElements;
scales.reserve(len);
offsets.reserve(len);
for (size_t i = 0; i < len; ++i) {
scales.push_back(param.bwAxisScaleOffsetEncoding.scales[i]);
offsets.push_back(param.bwAxisScaleOffsetEncoding.offsets[i]);
}
} break;
default:
QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected");
break;
}
return CreateQuantizeParamDirect(
*builder, def_map.at(param.encodingDefinition), quant_type, axis, &data);
*builder,
def_map.at(param.encodingDefinition),
quant_type,
bitwidth,
axis,
&scales,
&offsets,
&data);
}

Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
Expand All @@ -155,6 +187,10 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
QNN_QUANTIZATION_ENCODING_SCALE_OFFSET},
{qcir::QuantizeType::AXIS_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET},
{qcir::QuantizeType::BW_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET},
{qcir::QuantizeType::BW_AXIS_SCALE_OFFSET,
QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET},
{qcir::QuantizeType::UNDEFINED,
QNN_QUANTIZATION_ENCODING_UNDEFINED},
};
Expand All @@ -174,7 +210,22 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
reinterpret_cast<Qnn_ScaleOffset_t*>(
const_cast<uint8_t*>(param->data()->Data()));
} break;
// Per-tensor bitwidth encoding: one scale/offset pair plus a bitwidth.
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
  // bugfix: bitwidth must be written to bwScaleOffsetEncoding here —
  // writing it to bwAxisScaleOffsetEncoding (copy-paste from the axis
  // variant below) left this encoding's bitwidth uninitialized.
  p.bwScaleOffsetEncoding.bitwidth = param->bitwidth();
  p.bwScaleOffsetEncoding.scale = param->scales()->Get(0);
  p.bwScaleOffsetEncoding.offset = param->offsets()->Get(0);
} break;
// Per-axis bitwidth encoding: one scale/offset pair per element along axis.
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
  p.bwAxisScaleOffsetEncoding.bitwidth = param->bitwidth();
  p.bwAxisScaleOffsetEncoding.axis = param->axis();
  p.bwAxisScaleOffsetEncoding.numElements = param->scales()->size();
  // Flatbuffers exposes const data; the QNN struct wants mutable pointers,
  // so cast away constness (QNN does not write through these in practice —
  // NOTE(review): confirm against the QNN SDK contract).
  p.bwAxisScaleOffsetEncoding.scales =
      const_cast<float*>(param->scales()->data());
  p.bwAxisScaleOffsetEncoding.offsets =
      const_cast<int32_t*>(param->offsets()->data());
} break;
default:
QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected");
break;
}
return p;
Expand Down Expand Up @@ -212,8 +263,7 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) {
QNN_VER_PTR(t)->dataType = ToDataType(tensor->dtype());
QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor->qparam());
QNN_VER_PTR(t)->rank = tensor->shape()->size();
QNN_VER_PTR(t)->dimensions = reinterpret_cast<uint32_t*>(
const_cast<uint8_t*>(tensor->shape()->Data()));
QNN_VER_PTR(t)->dimensions = const_cast<uint32_t*>(tensor->shape()->data());
QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size();
QNN_VER_PTR(t)->clientBuf.data = is_io_tensor(QNN_VER_PTR(t)->type)
? nullptr
Expand Down
6 changes: 5 additions & 1 deletion backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,16 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {

py::class_<PyQnnManager, std::shared_ptr<PyQnnManager>>(m, "QnnManager")
.def(py::init<const py::bytes&>())
.def(py::init<const py::bytes&, const py::bytes&>())
.def("Init", &PyQnnManager::Init)
.def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend)
.def("Compile", &PyQnnManager::Compile)
.def("Destroy", &PyQnnManager::Destroy)
.def("IsAvailable", &PyQnnManager::IsAvailable)
.def("IsTensorDump", &PyQnnManager::IsTensorDump);
.def("IsTensorDump", &PyQnnManager::IsTensorDump)
.def("AllocateTensor", &PyQnnManager::AllocateTensor)
.def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
.def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
}
} // namespace qnn
} // namespace executor
Expand Down
36 changes: 36 additions & 0 deletions backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/
#pragma once
#include <executorch/backends/qualcomm/aot/ir/qcir_utils.h>
#include <executorch/backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h>
#include <executorch/backends/qualcomm/runtime/Logging.h>
#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/backends/qualcomm/runtime/QnnManager.h>
Expand All @@ -23,6 +24,7 @@ namespace executor {
namespace qnn {
class PyQnnManager {
public:
// used for AoT compilation
explicit PyQnnManager(const py::bytes& buffer)
: qnn_executorch_option_ptr_(buffer),
qnn_executorch_context_binary_(QNN_EXECUTORCH_CONTEXT_BINARY) {
Expand All @@ -33,6 +35,18 @@ class PyQnnManager {
qnn_manager_ = std::make_shared<QnnManager>(
qnn_executorch_options, qnn_executorch_context_binary_);
}
// Used for loading a pre-built QNN context binary directly (bypasses AoT
// graph construction; see the single-argument constructor for that path).
explicit PyQnnManager(const py::bytes& buffer, const py::bytes& ctx_bin)
    : qnn_executorch_option_ptr_(buffer) {
  // The stored py::bytes member keeps the options buffer alive for the
  // lifetime of this object.
  auto qnn_executorch_options = GetQnnExecuTorchOptions(
      qnn_executorch_option_ptr_.cast<std::string_view>().data());

  // Borrow the context binary's raw pointer/size via the buffer protocol.
  // NOTE(review): only info.ptr is stored — ctx_bin itself is NOT retained
  // (unlike `buffer` above), so the Python caller must presumably keep the
  // bytes object alive while this manager is in use; confirm, or retain
  // ctx_bin in a member as well.
  py::buffer_info info(py::buffer(ctx_bin).request());
  qnn_executorch_context_binary_.buffer = static_cast<void*>(info.ptr);
  qnn_executorch_context_binary_.nbytes = info.size * info.itemsize;
  qnn_manager_ = std::make_shared<QnnManager>(
      qnn_executorch_options, qnn_executorch_context_binary_);
}

Error Init() {
return qnn_manager_->Init();
Expand Down Expand Up @@ -141,6 +155,28 @@ class PyQnnManager {
return qnn_manager_->IsTensorDump();
}

// Delegates tensor allocation to the underlying QnnManager and returns its
// Error status.
Error AllocateTensor() {
  return qnn_manager_->AllocateTensor();
}

// Wraps each of the manager's graph input tensors in a PyQnnTensorWrapper
// and returns them to Python as a list.
py::list GetGraphInputs() {
  py::list wrapped_inputs;
  const auto& graph_inputs = qnn_manager_->GetGraphInputs();
  for (const auto& tensor : graph_inputs) {
    wrapped_inputs.append(PyQnnTensorWrapper(tensor));
  }
  return wrapped_inputs;
}

// Wraps each of the manager's graph output tensors in a PyQnnTensorWrapper
// and returns them to Python as a list.
py::list GetGraphOutputs() {
  py::list wrapped_outputs;
  const auto& graph_outputs = qnn_manager_->GetGraphOutputs();
  for (const auto& tensor : graph_outputs) {
    wrapped_outputs.append(PyQnnTensorWrapper(tensor));
  }
  return wrapped_outputs;
}

private:
// Store the bytes object instead of a raw pointer so that this module will
// keep the bytes alive.
Expand Down
14 changes: 14 additions & 0 deletions backends/qualcomm/aot/python/PyQnnWrapperAdaptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ std::shared_ptr<TensorWrapper> CreateTensorWrapper(
}

PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
PYBIND11_NUMPY_DTYPE(PyQnnTensorWrapper::EncodingData, scale, offset);

py::enum_<Qnn_TensorType_t>(m, "Qnn_TensorType_t")
.value(
"QNN_TENSOR_TYPE_APP_WRITE",
Expand Down Expand Up @@ -234,6 +236,18 @@ PYBIND11_MODULE(PyQnnWrapperAdaptor, m) {
"GetOpWrapper",
&PyQnnOpWrapper::GetOpWrapper,
"A function which get op wrapper");

py::class_<PyQnnTensorWrapper::Encoding>(m, "Encoding")
.def_readonly("data", &PyQnnTensorWrapper::Encoding::data)
.def_readonly("axis", &PyQnnTensorWrapper::Encoding::axis);

py::class_<PyQnnTensorWrapper, std::shared_ptr<PyQnnTensorWrapper>>(
m, "PyQnnTensorWrapper")
.def(py::init<const std::shared_ptr<TensorWrapper>&>())
.def("GetDims", &PyQnnTensorWrapper::GetDims)
.def("GetDataType", &PyQnnTensorWrapper::GetDataType)
.def("GetName", &PyQnnTensorWrapper::GetName)
.def("GetEncodings", &PyQnnTensorWrapper::GetEncodings);
}
} // namespace qnn
} // namespace executor
Expand Down
88 changes: 87 additions & 1 deletion backends/qualcomm/aot/python/PyQnnWrapperAdaptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ class PyQnnOpWrapper {
name, data_type, attrData["data"].cast<bool>());
break;
default:
QNN_EXECUTORCH_LOG_ERROR("tensor.v1.name: %d", data_type);
QNN_EXECUTORCH_LOG_ERROR(
"%s has invalid data type: %d", name, data_type);
break;
}
}
Expand All @@ -96,6 +97,91 @@ class PyQnnOpWrapper {
private:
std::shared_ptr<OpWrapper> op_wrapper_;
};

// Python-facing adaptor around a TensorWrapper. Exposes the tensor's
// dimensions, data type, name, and quantization encodings in
// numpy/pybind11-friendly form for the PyQnnWrapperAdaptor module.
class PyQnnTensorWrapper {
 public:
  explicit PyQnnTensorWrapper(const std::shared_ptr<TensorWrapper>& wrapper) {
    tensor_wrapper_ = wrapper;
  }
  // One (scale, offset) quantization pair; registered as a numpy structured
  // dtype via PYBIND11_NUMPY_DTYPE in the module definition.
  struct EncodingData {
    float scale;
    int32_t offset;
  };
  // Quantization encodings of one tensor: an array of EncodingData plus the
  // quantization axis (-1 for the non-axis encodings).
  struct Encoding {
    py::array_t<EncodingData> data;
    int32_t axis;
  };

  // Copies the tensor's shape into a fresh 1-D numpy array of GetRank()
  // uint32 elements.
  py::array_t<std::uint32_t> GetDims() {
    std::uint32_t* dim = tensor_wrapper_->GetDims();
    size_t shape[1]{tensor_wrapper_->GetRank()};
    size_t stride[1]{sizeof(std::uint32_t)};
    auto ret = py::array_t<std::uint32_t>(shape, stride);
    auto view = ret.mutable_unchecked<1>();
    for (int i = 0; i < ret.shape(0); ++i) {
      view(i) = dim[i];
    }
    return ret;
  }
  // Tensor name, returned by value (copies the wrapper's stored name).
  std::string GetName() {
    return tensor_wrapper_->GetName();
  }
  Qnn_DataType_t GetDataType() {
    return tensor_wrapper_->GetDataType();
  }
  // Converts this tensor's QNN quantization encoding into an Encoding:
  // per-tensor encodings produce a single EncodingData with axis == -1;
  // per-axis encodings produce one entry per scale/offset plus the axis.
  // An undefined/unsupported encoding logs an error and falls through to a
  // value-initialized Encoding (empty array, axis == 0).
  Encoding GetEncodings() {
    auto q_param = tensor_wrapper_->GetQuantizeParams();
    size_t stride[1]{sizeof(EncodingData)};

    switch (q_param.quantizationEncoding) {
      // One scale/offset for the whole tensor.
      case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: {
        Qnn_ScaleOffset_t data = q_param.scaleOffsetEncoding;
        size_t shape[1]{1};
        auto enc_data = py::array_t<EncodingData>(shape, stride);
        auto view = enc_data.mutable_unchecked<1>();
        view(0) = {data.scale, data.offset};
        return {enc_data, -1};
      }
      // One scale/offset per slice along `axis`.
      case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
        Qnn_AxisScaleOffset_t data = q_param.axisScaleOffsetEncoding;
        size_t shape[1]{data.numScaleOffsets};
        auto enc_data = py::array_t<EncodingData>(shape, stride);
        auto view = enc_data.mutable_unchecked<1>();
        for (int i = 0; i < enc_data.shape(0); ++i) {
          view(i) = {data.scaleOffset[i].scale, data.scaleOffset[i].offset};
        }
        return {enc_data, data.axis};
      }
      // Bitwidth variants mirror the two cases above; the bitwidth itself
      // is not surfaced through Encoding.
      case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: {
        Qnn_BwScaleOffset_t data = q_param.bwScaleOffsetEncoding;
        size_t shape[1]{1};
        auto enc_data = py::array_t<EncodingData>(shape, stride);
        auto view = enc_data.mutable_unchecked<1>();
        view(0) = {data.scale, data.offset};
        return {enc_data, -1};
      }
      case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
        Qnn_BwAxisScaleOffset_t data = q_param.bwAxisScaleOffsetEncoding;
        size_t shape[1]{data.numElements};
        auto enc_data = py::array_t<EncodingData>(shape, stride);
        auto view = enc_data.mutable_unchecked<1>();
        for (int i = 0; i < enc_data.shape(0); ++i) {
          view(i) = {data.scales[i], data.offsets[i]};
        }
        return {enc_data, data.axis};
      }
      default:
        QNN_EXECUTORCH_LOG_ERROR(
            "%s QNN_QUANTIZATION_ENCODING_UNDEFINED detected",
            GetName().c_str());
        break;
    }
    return {};
  }

 private:
  std::shared_ptr<TensorWrapper> tensor_wrapper_;
};
} // namespace qnn
} // namespace executor
} // namespace torch
6 changes: 5 additions & 1 deletion backends/qualcomm/aot/wrappers/TensorWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ class TensorWrapper {
return QNN_VER_PTR(tensor_)->memType;
};

std::string GetName() const {
// Returns the quantization parameters stored in the underlying
// Qnn_Tensor_t (by value; Qnn_QuantizeParams_t is a plain struct).
Qnn_QuantizeParams_t GetQuantizeParams() const {
  return QNN_VER_PTR(tensor_)->quantizeParams;
}

// Returns the tensor name by const reference (avoids the per-call copy the
// previous by-value signature made).
const std::string& GetName() const {
  return qnn_tensor_name_;
};

Expand Down
Loading
Loading