
Commit 14ddfd4

Qualcomm AI Engine Direct - CI for QNN Static Stories Llama (#7884)
* Add Static Stories Llama CI
* Enable x86 runner for static llama; create a script for the static llama CI
1 parent 5c52fbe commit 14ddfd4

File tree

7 files changed: +223 −49 lines


.ci/scripts/test_qnn_static_llama.sh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
+export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
+export PYTHONPATH=".."
+cp schema/program.fbs exir/_serialize/program.fbs
+cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
+cp -f build-x86/backends/qualcomm/PyQnnManagerAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+cp -f build-x86/backends/qualcomm/PyQnnWrapperAdaptor.cpython-310-x86_64-linux-gnu.so backends/qualcomm/python
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# Although the static llama CI does not require graphviz, it is required by test_qnn_delegate.py
+pip install graphviz
+
+# Download stories110M artifacts
+download_stories_model_artifacts
+echo "Creating tokenizer.bin"
+$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+
+set +e
+# Compile only, since weight sharing is not applicable on x86
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+exit_code1=$?
+
+# Check accuracy with weight sharing disabled, since x86 does not support weight sharing.
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+exit_code2=$?
+
+# Check the exit codes and print messages
+if [ $exit_code1 -ne 0 ]; then
+  echo "Static Llama compile-only with weight sharing test failed: $exit_code1."
+fi
+
+if [ $exit_code2 -ne 0 ]; then
+  echo "Static Llama accuracy test failed: $exit_code2."
+fi
+
+# Return failure if either program failed
+if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
+  exit 1
+else
+  exit 0
+fi
+set -e
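
The tail of the script is a "run both variants, fail if either failed" pattern. As an aside, the same control flow in Python looks like the sketch below; this is illustrative only, not part of the commit, with arguments copied from the two invocations above.

# Illustrative sketch, not part of the commit: mirrors the script's two test
# invocations and its exit-code aggregation.
import subprocess
import sys

def run_stories_test(extra_args):
    cmd = [
        sys.executable,
        "backends/qualcomm/tests/test_qnn_delegate.py",
        "-k", "TestExampleScript.test_stories_single_llama",
        "--model", "SM8650",
        "--executorch_root", ".",
        "--artifact_dir", ".",
    ] + extra_args
    # returncode is non-zero when the unittest run fails
    return subprocess.run(cmd).returncode

# Compile-only pass exercises weight sharing (x86 cannot execute that pte).
rc_compile = run_stories_test(["--build_folder", "build-android/", "--compile_only"])
# Accuracy pass runs on x86 with weight sharing disabled.
rc_accuracy = run_stories_test(["--build_folder", "build-x86/", "--enable_x86_64"])
sys.exit(1 if (rc_compile or rc_accuracy) else 0)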

.github/workflows/pull.yml

Lines changed: 33 additions & 0 deletions
@@ -437,6 +437,39 @@ jobs:
       # Test llama2
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
 
+  test-static-llama-qnn-linux:
+    name: test-static-llama-qnn-linux
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 180
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        BUILD_TOOL="cmake"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
+
+        # Setup executorch
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
+
+        # Setup install_requirements for llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
+
+        # Test static llama weight sharing and accuracy
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_qnn_static_llama.sh
+
   test-qnn-models-linux:
     name: test-qnn-models-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 23 additions & 10 deletions
@@ -2014,6 +2014,7 @@ def test_qnn_backend_multi_graphs(self):
                 soc_model=self.chipset_table[TestQNN.model],
                 backend_options=backend_options,
                 multiple_graphs=True,
+                weight_sharing=True,
                 graph_name=graph_name,
             )
             for graph_name in graph_names
@@ -2577,6 +2578,7 @@ def test_qnn_backend_multi_graphs(self):
                 soc_model=self.chipset_table[TestQNN.model],
                 backend_options=backend_options,
                 multiple_graphs=True,
+                weight_sharing=True,
                 graph_name=graph_name,
             )
             for graph_name in graph_names
@@ -3822,8 +3824,6 @@ def test_stories_single_llama(self):
             self.artifact_dir,
             "--build_folder",
             self.build_folder,
-            "--device",
-            self.device,
             "--model",
             self.model,
             "--checkpoint",
@@ -3846,9 +3846,21 @@
             "0",
             "--llama_model",
             "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
         ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
         if self.host:
            cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
 
         golden_start_with = "Once upon a time,"
         p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
@@ -3859,8 +3871,13 @@
             if "Error" in msg:
                 self.fail(msg["Error"])
             else:
-                model_out = msg["result"][0]
-                self.assertTrue(model_out.startswith(golden_start_with))
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(model_out.startswith(golden_start_with))
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
 
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
@@ -4065,12 +4082,6 @@ def setup_environment():
         help="Path to open source software model repository",
         type=str,
     )
-    parser.add_argument(
-        "-x",
-        "--enable_x86_64",
-        help="Enable unittest to be executed on x86_64 platform",
-        action="store_true",
-    )
 
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
@@ -4089,6 +4100,8 @@ def setup_environment():
     TestQNN.shared_buffer = args.shared_buffer
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
+    TestQNN.compile_only = args.compile_only
+
     return sys.argv[:1] + ns_args
 
 
backends/qualcomm/tests/utils.py

Lines changed: 1 addition & 0 deletions
@@ -182,6 +182,7 @@ class TestQNN(unittest.TestCase):
     use_16a4w: str = "16a4w"
     shared_buffer: bool = False
     enable_x86_64: bool = False
+    compile_only: bool = False
 
     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))

backends/qualcomm/utils/utils.py

Lines changed: 12 additions & 1 deletion
@@ -1166,6 +1166,7 @@ def generate_qnn_executorch_compiler_spec(
     shared_buffer: bool = False,
     is_from_context_binary: bool = False,
     multiple_graphs: bool = False,
+    weight_sharing: bool = False,
     graph_name: str = "forward",
 ) -> List[CompileSpec]:
     """
@@ -1196,6 +1197,7 @@
         is_from_context_binary: True if current graph comes from pre-built context binary.
         multiple_graphs: True if multiple methods are expected to have in single .pte file.
             Please see test cases for post-processing example.
+        weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs.
         graph_name: Assign unique graph name if 'multiple_graphs' is used.
 
     Returns:
@@ -1216,6 +1218,12 @@
             stacklevel=1,
         )
 
+    if weight_sharing and not multiple_graphs:
+        warnings.warn(
+            "Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs",
+            stacklevel=1,
+        )
+
     qnn_executorch_options = QnnExecuTorchOptions(
         _soc_info_table[soc_model], backend_options
     )
@@ -1257,7 +1265,10 @@
 
     if multiple_graphs:
         # enable weight sharing mechanism if multiple graphs appear
-        if backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend:
+        if (
+            backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend
+            and weight_sharing
+        ):
             backend_options.htp_options.use_weight_sharing = True
 
     return [
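
For context on how the new flag is meant to be used: weight_sharing is passed per graph together with multiple_graphs, as in the test changes above. The sketch below is not from this commit; the import paths and the generate_htp_compiler_spec arguments are assumptions and may differ across ExecuTorch versions.

# Minimal usage sketch of weight_sharing; not from this commit.
# Import paths below are assumptions and may differ by version.
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)

backend_options = generate_htp_compiler_spec(use_fp16=False)
graph_names = ["prefill", "kv"]  # hypothetical graph names for a hybrid llama

# One CompileSpec list per graph. Weight sharing takes effect only on the HTP
# backend and only when multiple_graphs is also set; otherwise the new warning fires.
compiler_specs = [
    generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8650,
        backend_options=backend_options,
        multiple_graphs=True,
        weight_sharing=True,
        graph_name=name,
    )
    for name in graph_names
]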
