
Commit f0ef51c

Qualcomm AI Engine Direct - CI For Llama (#8512)
Enable an inference speed check in CI and add a Llama 3.2 1B test.
1 parent cc3974f commit f0ef51c

File tree

5 files changed: +204 -68 lines changed

.ci/scripts/test_qnn_static_llama.sh
backends/qualcomm/tests/test_qnn_delegate.py
backends/qualcomm/tests/utils.py
examples/qualcomm/oss_scripts/llama/llama.py
examples/qualcomm/oss_scripts/llama/runner/runner.cpp

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 2 additions & 2 deletions
@@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to
 
 set +e
 # Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
 exit_code1=$?
 
 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
 # Check the exit codes and print messages

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 181 additions & 66 deletions
@@ -3106,6 +3106,173 @@ def test_qnn_backend_draw_graph(self):
         ), "Generated .dot file does not match the golden file."
 
 
+class TestExampleLLMScript(TestQNN):
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check when running on Android
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
@@ -4001,72 +4168,6 @@ def test_deeplab_v3(self):
                 self.assertGreaterEqual(msg["MPA"], 0.70)
                 self.assertGreaterEqual(msg["MIoU"], 0.55)
 
-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4271,6 +4372,18 @@ def setup_environment():
         type=str,
     )
 
+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated pte in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="A folder that contains: weight, tokenizer, and params.",
+        type=str,
+    )
+
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4289,6 +4402,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
    TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts
 
     return sys.argv[:1] + ns_args
 
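The new tests do not scrape stdout; llama.py hands its results back to the waiting test process over a socket as a JSON payload with "result", "pte_size", and "inference_speed" keys. Below is a minimal, self-contained sketch of that handshake, assuming the multiprocessing.connection API and an illustrative loopback address (the real scripts receive the address via --ip/--port, and the sender is the separately spawned llama.py process, not a thread):

import json
import threading
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 6000)  # illustrative only; the tests pass --ip/--port


def fake_llama_script():
    # Plays the role of llama.py: connect back and report the run results.
    with Client(ADDRESS) as conn:
        conn.send(
            json.dumps(
                {
                    "result": ["Once upon a time, there was a girl named Lily."],
                    "pte_size": 120_000_000,
                    "inference_speed": 230.0,
                }
            )
        )


# Plays the role of the unit test: wait for the payload and check it.
with Listener(ADDRESS) as listener:
    threading.Thread(target=fake_llama_script).start()
    conn = listener.accept()
    msg = json.loads(conn.recv())
    assert msg["result"][0].startswith("Once upon a time,")
    assert msg["pte_size"] <= 130_000_000
    assert msg["inference_speed"] >= 220  # tokens/sec threshold used above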

backends/qualcomm/tests/utils.py

Lines changed: 2 additions & 0 deletions
@@ -188,6 +188,8 @@ class TestQNN(unittest.TestCase):
     shared_buffer: bool = False
     enable_x86_64: bool = False
     compile_only: bool = False
+    pre_gen_pte: str = ""
+    llama_artifacts: str = ""
 
     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 5 additions & 0 deletions
@@ -881,13 +881,18 @@ def post_process():
 
     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
+        inference_speed = 0
+        with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
+            inference_speed = float(f.read())
+
         pte_size = os.path.getsize(pte_path)
         with Client((args.ip, args.port)) as conn:
             conn.send(
                 json.dumps(
                     {
                         "result": outputs,
                         "pte_size": pte_size,
+                        "inference_speed": inference_speed,
                     }
                 )
             )

examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 14 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <executorch/runtime/platform/log.h>
 #include <ctime>
+#include <fstream>
 #include <sstream>
 
 using executorch::aten::Tensor;
@@ -518,6 +519,19 @@ void printReport(const Runner::Stats& stats) {
       stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // For now, we just print the total inference time for CI, can save more info
+  // in future if needed.
+  std::ofstream outfile("outputs/inference_speed.txt");
+  if (outfile.is_open()) {
+    double num_tok = (stats.num_generated_tokens) /
+        (double)(stats.inference_end_ms - stats.inference_start_ms) *
+        stats.SCALING_FACTOR_UNITS_PER_SECOND;
+    outfile << num_tok;
+    outfile.close();
+  } else {
+    ET_CHECK_MSG(false, "Error saving the inference speed file");
+  }
 }
 
 std::string statsToJsonString(const Runner::Stats& stats) {
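The figure written to outputs/inference_speed.txt is generated tokens per second of wall-clock decode time, which the test later compares against the 66 and 220 tok/s thresholds. A rough Python restatement of the arithmetic, assuming the stats timestamps are in milliseconds (so the scaling factor is 1000); the names here are illustrative, not the runner's API:

SCALING_FACTOR_UNITS_PER_SECOND = 1000  # assumed: timestamps recorded in ms


def tokens_per_second(num_generated_tokens: int,
                      inference_start_ms: int,
                      inference_end_ms: int) -> float:
    # Mirrors the num_tok computation in printReport above.
    elapsed_ms = inference_end_ms - inference_start_ms
    return num_generated_tokens / elapsed_ms * SCALING_FACTOR_UNITS_PER_SECOND


# Example: 128 tokens generated in 550 ms -> ~232.7 tok/s, which would clear
# the 220 tok/s threshold asserted for stories110M.
print(tokens_per_second(128, 0, 550))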
