Skip to content

Commit 1090bcd

Browse files
chuntl authored and facebook-github-bot committed
Qualcomm AI Engine Direct - Enable HTP emulator test in x86 host (#4503)
Summary:
- Enable x64 runner
- Enable HTP emulator test on unit test
- Fix unexpected error message
- Fix multi-contexts UT's mismatching datatype issue
- Port x64 dequantize flow instead of using arm_neon intrinsics
- Fix EtDump flow on runner and unittest

Pull Request resolved: #4503
Reviewed By: digantdesai
Differential Revision: D60598800
Pulled By: cccclai
fbshipit-source-id: bfb9df7948c3f64b2bd0e140836dfbd2d4655c0b
1 parent d59419c commit 1090bcd

File tree

11 files changed

+160
-61
lines changed

11 files changed

+160
-61
lines changed

backends/qualcomm/aot/ir/qcir_utils.cpp

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ Qnn_DataType_t ToDataType(qcir::DataType type) {
100100
}
101101

102102
flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
103-
const Qnn_QuantizeParams_t& param,
103+
const Qnn_Tensor_t& tensor,
104104
flatbuffers::FlatBufferBuilder* builder) {
105105
static const std::unordered_map<Qnn_Definition_t, qcir::QuantizeDef> def_map{
106106
{QNN_DEFINITION_IMPL_GENERATED, qcir::QuantizeDef::IMPL_GENERATED},
@@ -124,6 +124,7 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
124124

125125
int32_t axis = 0;
126126
uint32_t bitwidth = 0;
127+
auto param = QNN_VER_PTR(tensor)->quantizeParams;
127128
auto quant_type = type_map.at(param.quantizationEncoding);
128129
std::vector<qcir::ScaleOffset> data;
129130
std::vector<float> scales;
@@ -160,7 +161,9 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
160161
}
161162
} break;
162163
default:
163-
QNN_EXECUTORCH_LOG_ERROR("QNN_QUANTIZATION_ENCODING_UNDEFINED detected");
164+
QNN_EXECUTORCH_LOG_WARN(
165+
"QNN_QUANTIZATION_ENCODING_UNDEFINED detected: %s",
166+
QNN_VER_PTR(tensor)->name);
164167
break;
165168
}
166169
return CreateQuantizeParamDirect(
@@ -174,7 +177,7 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
174177
&data);
175178
}
176179

177-
Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
180+
Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor) {
178181
static const std::unordered_map<qcir::QuantizeDef, Qnn_Definition_t> def_map{
179182
{qcir::QuantizeDef::IMPL_GENERATED, QNN_DEFINITION_IMPL_GENERATED},
180183
{qcir::QuantizeDef::DEFINED, QNN_DEFINITION_DEFINED},
@@ -196,6 +199,7 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
196199
};
197200

198201
Qnn_QuantizeParams_t p = QNN_QUANTIZE_PARAMS_INIT;
202+
auto param = tensor->qparam();
199203
p.encodingDefinition = def_map.at(param->def());
200204
p.quantizationEncoding = type_map.at(param->type());
201205
switch (p.quantizationEncoding) {
@@ -225,7 +229,9 @@ Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& param) {
225229
const_cast<int32_t*>(param->offsets()->data());
226230
} break;
227231
default:
228-
QNN_EXECUTORCH_LOG_ERROR("qcir::QuantizeType::UNDEFINED detected");
232+
QNN_EXECUTORCH_LOG_WARN(
233+
"qcir::QuantizeType::UNDEFINED detected: %s",
234+
tensor->name()->c_str());
229235
break;
230236
}
231237
return p;
@@ -248,7 +254,7 @@ flatbuffers::Offset<qcir::Tensor> ToTensor(
248254
&shape,
249255
ToTensorType(QNN_VER_PTR(tensor)->type),
250256
ToDataType(QNN_VER_PTR(tensor)->dataType),
251-
ToQuantizeParam(QNN_VER_PTR(tensor)->quantizeParams, builder),
257+
ToQuantizeParam(tensor, builder),
252258
&buffer);
253259
}
254260

@@ -261,7 +267,7 @@ Qnn_Tensor_t ToTensor(const tensor_type& tensor) {
261267
QNN_VER_PTR(t)->name = tensor->name()->c_str();
262268
QNN_VER_PTR(t)->type = ToTensorType(tensor->type());
263269
QNN_VER_PTR(t)->dataType = ToDataType(tensor->dtype());
264-
QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor->qparam());
270+
QNN_VER_PTR(t)->quantizeParams = ToQuantizeParam(tensor);
265271
QNN_VER_PTR(t)->rank = tensor->shape()->size();
266272
QNN_VER_PTR(t)->dimensions = const_cast<uint32_t*>(tensor->shape()->data());
267273
QNN_VER_PTR(t)->clientBuf.dataSize = tensor->data()->size();

backends/qualcomm/aot/ir/qcir_utils.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,9 @@ qcir::DataType ToDataType(Qnn_DataType_t type);
2626
Qnn_DataType_t ToDataType(qcir::DataType type);
2727

2828
flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
29-
const Qnn_QuantizeParams_t& param,
29+
const Qnn_Tensor_t& tensor,
3030
flatbuffers::FlatBufferBuilder* builder);
31-
Qnn_QuantizeParams_t ToQuantizeParam(const qparam_type& type);
31+
Qnn_QuantizeParams_t ToQuantizeParam(const tensor_type& tensor);
3232

3333
flatbuffers::Offset<qcir::Tensor> ToTensor(
3434
const Qnn_Tensor_t& tensor,

backends/qualcomm/runtime/QnnExecuTorchBackend.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,12 +23,12 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
2323
ArrayRef<CompileSpec> compile_specs) const {
2424
// covert SizedBuffer to qnn ExecuTorch option
2525
QnnExecuTorchContextBinary qnn_context_blob;
26-
const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options;
26+
const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options = nullptr;
2727

2828
qnn_context_blob.buffer = const_cast<void*>(processed->data());
2929
qnn_context_blob.nbytes = processed->size();
3030

31-
// covert CompileSpec to qnn ExecuTorch option
31+
// convert CompileSpec to qnn ExecuTorch option
3232
for (auto& compile_spec : compile_specs) {
3333
if (std::strcmp(compile_spec.key, QNN_COMPILE_SPEC) == 0)
3434
qnn_executorch_options =

backends/qualcomm/runtime/SharedBuffer.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,12 @@ SharedBuffer& SharedBuffer::GetSharedBufferManager() {
8787
std::lock_guard<std::mutex> lk(init_mutex_);
8888
static SharedBuffer shared_buffer_manager;
8989
if (!shared_buffer_manager.GetInitialize()) {
90+
#if defined(__aarch64__)
9091
Error status = shared_buffer_manager.Load();
92+
#else
93+
// For x86_64 platform
94+
Error status = Error::Ok;
95+
#endif
9196
if (status == Error::Ok) {
9297
shared_buffer_manager.SetInitialize(true);
9398
}
@@ -96,9 +101,11 @@ SharedBuffer& SharedBuffer::GetSharedBufferManager() {
96101
}
97102

98103
SharedBuffer::~SharedBuffer() {
104+
#if defined(__aarch64__)
99105
if (initialize_) {
100106
SharedBuffer::GetSharedBufferManager().UnLoad();
101107
}
108+
#endif
102109
};
103110

104111
void* SharedBuffer::AllocMem(size_t bytes, size_t alignment) {

backends/qualcomm/scripts/build.sh

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -107,19 +107,33 @@ if [ "$BUILD_X86_64" = true ]; then
107107
rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
108108
fi
109109
cd $BUILD_ROOT
110+
# TODO: Use CMAKE_BUILD_TYPE=RelWithDebInfo, and handle flatcc issues
110111
cmake \
111-
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
112+
-DCMAKE_BUILD_TYPE=Debug \
113+
-DCMAKE_INSTALL_PREFIX=$BUILD_ROOT \
112114
-DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
113115
-DEXECUTORCH_BUILD_QNN=ON \
116+
-DEXECUTORCH_BUILD_SDK=ON \
117+
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
118+
-DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
114119
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
115-
-DBUCK2=$BUCK2 \
116120
-S $PRJ_ROOT \
117121
-B $BUILD_ROOT \
118122

119-
cmake \
120-
--build $BUILD_ROOT \
121-
-t "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j16
123+
cmake --build $BUILD_ROOT -j16 --target install
122124

123125
rm -f $PRJ_ROOT/backends/qualcomm/python/*
124126
cp -fv $BUILD_ROOT/backends/qualcomm/Py* "$PRJ_ROOT/backends/qualcomm/python"
127+
128+
EXAMPLE_ROOT=examples/qualcomm
129+
CMAKE_PREFIX_PATH="${BUILD_ROOT}/lib/cmake/ExecuTorch;${BUILD_ROOT}/third-party/gflags;"
130+
131+
cmake $PRJ_ROOT/$EXAMPLE_ROOT \
132+
-DCMAKE_BUILD_TYPE=Debug \
133+
-DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH \
134+
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
135+
-DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
136+
-B$EXAMPLE_ROOT
137+
138+
cmake --build $EXAMPLE_ROOT -j16
125139
fi

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -147,6 +147,7 @@ def test_qnn_backend_element_wise_ceil(self):
147147

148148
def test_qnn_backend_element_wise_div(self):
149149
eps = 1e-03
150+
torch.manual_seed(8)
150151
test_comb = [
151152
{
152153
QCOM_MODULE: [Div()], # noqa: F405
@@ -721,6 +722,7 @@ def test_qnn_backend_element_wise_ceil(self):
721722

722723
def test_qnn_backend_element_wise_div(self):
723724
eps = 1e-03
725+
torch.manual_seed(8)
724726
test_comb = [
725727
{
726728
QCOM_MODULE: [Div()], # noqa: F405
@@ -1323,7 +1325,6 @@ def test_qnn_backend_multi_contexts_composite(self):
13231325
exec_prog = edge_prog.to_executorch()
13241326
self.verify_output(module.get_reference_module(), sample_input, exec_prog)
13251327

1326-
@unittest.expectedFailure
13271328
def test_qnn_backend_profile_op(self):
13281329
TestQNN.enable_profile = True
13291330
backend_options = generate_htp_compiler_spec(use_fp16=True)
@@ -1338,7 +1339,7 @@ def test_qnn_backend_profile_op(self):
13381339
module,
13391340
sample_input,
13401341
expected_partitions=1,
1341-
expected_profile_events=25,
1342+
expected_profile_events=24,
13421343
)
13431344

13441345
def test_qnn_backend_shared_buffer(self):
@@ -1488,7 +1489,6 @@ def test_qnn_backend_multi_contexts_composite(self):
14881489
exec_prog = edge_prog.to_executorch()
14891490
self.verify_output(module.get_reference_module(), sample_input, exec_prog)
14901491

1491-
@unittest.expectedFailure
14921492
def test_qnn_backend_profile_op(self):
14931493
TestQNN.enable_profile = True
14941494
backend_options = generate_htp_compiler_spec(use_fp16=False)
@@ -1504,7 +1504,7 @@ def test_qnn_backend_profile_op(self):
15041504
module,
15051505
sample_input,
15061506
expected_partitions=1,
1507-
expected_profile_events=26,
1507+
expected_profile_events=25,
15081508
)
15091509

15101510
def test_qnn_backend_shared_buffer(self):
@@ -2288,6 +2288,12 @@ def setup_environment():
22882288
help="Path to open source software model repository",
22892289
type=str,
22902290
)
2291+
parser.add_argument(
2292+
"-x",
2293+
"--enable_x86_64",
2294+
help="Enable unittest to be executed on x86_64 platform",
2295+
action="store_true",
2296+
)
22912297

22922298
args, ns_args = parser.parse_known_args(namespace=unittest)
22932299
TestQNN.host = args.host
@@ -2304,6 +2310,7 @@ def setup_environment():
23042310
TestQNN.error_only = args.error_only
23052311
TestQNN.oss_repo = args.oss_repo
23062312
TestQNN.shared_buffer = args.shared_buffer
2313+
TestQNN.enable_x86_64 = args.enable_x86_64
23072314
return sys.argv[:1] + ns_args
23082315

23092316

backends/qualcomm/tests/utils.py

Lines changed: 64 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,11 @@
2727
QcomChipset,
2828
)
2929
from executorch.backends.qualcomm.utils.utils import capture_program
30-
from executorch.examples.qualcomm.scripts.utils import SimpleADB
30+
from executorch.examples.qualcomm.scripts.utils import (
31+
generate_inputs,
32+
make_output_dir,
33+
SimpleADB,
34+
)
3135

3236
from executorch.exir.backend.backend_api import to_backend
3337
from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -133,6 +137,7 @@ class TestQNN(unittest.TestCase):
133137
use_16a16w: str = "16a16w"
134138
use_16a4w: str = "16a4w"
135139
shared_buffer: bool = False
140+
enable_x86_64: bool = False
136141

137142
def _assert_outputs_equal(self, model_output, ref_output):
138143
self.assertTrue(len(ref_output) == len(model_output))
@@ -201,40 +206,75 @@ def verify_output(
201206
tmp_dir,
202207
)
203208

204-
device_output_dir = f"{tmp_dir}/outputs"
205-
device_outputs = []
209+
output_dir = f"{tmp_dir}/outputs"
210+
outputs = []
206211
etdump_path = f"{tmp_dir}/etdump.etdp"
207212

208213
def post_process():
209-
for i, f in enumerate(sorted(os.listdir(device_output_dir))):
210-
filename = os.path.join(device_output_dir, f)
214+
for i, f in enumerate(sorted(os.listdir(output_dir))):
215+
filename = os.path.join(output_dir, f)
211216
output = np.fromfile(filename, dtype=ref_outputs[i].numpy().dtype)
212217
output = torch.from_numpy(output).reshape(ref_outputs[i].shape)
213-
device_outputs.append(output)
218+
outputs.append(output)
214219

215220
def validate_profile():
216221
inspector = Inspector(etdump_path=etdump_path, etrecord=etrecord_path)
217222
self.assertTrue(
218223
len(inspector.to_dataframe().index) == expected_profile_events
219224
)
220225

221-
adb = SimpleADB(
222-
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
223-
build_path=self.build_folder,
224-
pte_path=pte_fname,
225-
workspace="/data/local/tmp/qnn_executorch_test",
226-
device_id=self.device,
227-
host_id=self.host,
228-
soc_model=self.model,
229-
error_only=self.error_only,
230-
)
231-
adb.push(inputs=[sample_inputs], input_list=input_list)
232-
adb.execute()
233-
adb.pull(output_path=tmp_dir, callback=post_process)
234-
self._assert_outputs_equal(device_outputs, ref_outputs)
226+
if self.enable_x86_64:
227+
generate_inputs(tmp_dir, "input_list.txt", [sample_inputs], input_list)
228+
make_output_dir(output_dir)
229+
230+
target = "x86_64-linux-clang"
231+
qnn_sdk = os.environ.get("QNN_SDK_ROOT", None)
232+
assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable"
233+
234+
build_path = "build_x86_64"
235+
cmds = [
236+
# export LD_LIBRARY_PATH to QNN_SDK_ROOT
237+
f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{self.executorch_root}/{build_path}/lib && "
238+
# qnn_executor_runner
239+
f"{self.executorch_root}/{build_path}/examples/qualcomm/qnn_executor_runner",
240+
f"--model_path {pte_fname}",
241+
f"--input_list_path {tmp_dir}/input_list.txt",
242+
f"--output_folder_path {output_dir}",
243+
]
244+
245+
subprocess.run(
246+
" ".join(cmds),
247+
shell=True,
248+
executable="/bin/bash",
249+
capture_output=True,
250+
cwd=tmp_dir,
251+
)
252+
253+
# Verify the outputs
254+
post_process()
255+
self._assert_outputs_equal(outputs, ref_outputs)
256+
257+
# Verify the etdump
258+
if expected_profile_events != -1:
259+
validate_profile()
260+
else:
261+
adb = SimpleADB(
262+
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
263+
build_path=self.build_folder,
264+
pte_path=pte_fname,
265+
workspace="/data/local/tmp/qnn_executorch_test",
266+
device_id=self.device,
267+
host_id=self.host,
268+
soc_model=self.model,
269+
error_only=self.error_only,
270+
)
271+
adb.push(inputs=[sample_inputs], input_list=input_list)
272+
adb.execute()
273+
adb.pull(output_path=tmp_dir, callback=post_process)
274+
self._assert_outputs_equal(outputs, ref_outputs)
235275

236-
if expected_profile_events != -1:
237-
adb.pull_etdump(etdump_path, callback=validate_profile)
276+
if expected_profile_events != -1:
277+
adb.pull_etdump(etdump_path, callback=validate_profile)
238278

239279
def lower_module_and_test_output(
240280
self,
@@ -362,6 +402,8 @@ def _insert_clone(
362402
(node,),
363403
)
364404
inserted_node.meta["val"] = node.meta["val"]
405+
if "quant_attrs" in node.meta:
406+
inserted_node.meta["quant_attrs"] = node.meta["quant_attrs"]
365407
for user in users:
366408
user.replace_input_with(node, inserted_node)
367409

examples/qualcomm/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,6 @@
77
set(CMAKE_CXX_STANDARD 17)
88
# qnn_executor_runner: Like executor_runner but with QNN
99

10-
if(NOT ${ANDROID})
11-
message(FATAL_ERROR "Not building Android, quitting...")
12-
endif()
1310
cmake_minimum_required(VERSION 3.19)
1411
project(qualcomm_runner_example)
1512

examples/qualcomm/executor_runner/qnn_executor_runner.cpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131

3232
#include <gflags/gflags.h>
3333

34+
#include <chrono>
3435
#include <fstream>
3536
#include <memory>
3637

@@ -202,10 +203,8 @@ int main(int argc, char** argv) {
202203
// be used by a single thread at at time, but it can be reused.
203204
//
204205
torch::executor::ETDumpGen etdump_gen = torch::executor::ETDumpGen();
205-
// TODO: So far we have issues with etdump_gen during load_method. Enable it
206-
// after the issues are fixed.
207206
Result<Method> method =
208-
program->load_method(method_name, &memory_manager, nullptr);
207+
program->load_method(method_name, &memory_manager, &etdump_gen);
209208
ET_CHECK_MSG(
210209
method.ok(),
211210
"Loading of method %s failed with status 0x%" PRIx32,

0 commit comments

Comments
 (0)