@@ -3106,6 +3106,173 @@ def test_qnn_backend_draw_graph(self):
         ), "Generated .dot file does not match the golden file."


+class TestExampleLLMScript(TestQNN):
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check when running on Android.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
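Note: both new tests collect their results from the spawned llama.py process over a multiprocessing.connection socket rather than scraping stdout. Below is a minimal, self-contained sketch of that handshake; the sender side is an assumption about what the example runner does (it is not part of this diff), and the address is a placeholder.

# Minimal sketch of the result handshake used by the tests above.
# Assumptions: the runner connects back with Client and sends a single
# JSON string; only the message keys ("Error", "result", "pte_size",
# "inference_speed") are taken from the test code in this diff.
import json
import threading
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 6000)  # placeholder host/port


def fake_runner():
    # Stand-in for the example script reporting its results back.
    with Client(ADDRESS) as conn:
        conn.send(
            json.dumps(
                {
                    "result": ["Once upon a time, ..."],
                    "pte_size": 120_000_000,
                    "inference_speed": 230,
                }
            )
        )


def collect_result():
    # Mirrors the test-side pattern: accept one connection, decode the
    # JSON payload, and treat an "Error" key as a failure.
    with Listener(ADDRESS) as listener:
        sender = threading.Thread(target=fake_runner)
        sender.start()
        conn = listener.accept()
        msg = json.loads(conn.recv())
        sender.join()
        if "Error" in msg:
            raise RuntimeError(msg["Error"])
        return msg


if __name__ == "__main__":
    print(collect_result()["result"][0])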
@@ -4001,72 +4168,6 @@ def test_deeplab_v3(self):
                 self.assertGreaterEqual(msg["MPA"], 0.70)
                 self.assertGreaterEqual(msg["MIoU"], 0.55)

-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size.
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4271,6 +4372,18 @@ def setup_environment():
         type=str,
     )

+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated pte in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="A folder that contains: weight, tokenizer, and params.",
+        type=str,
+    )
+
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4289,6 +4402,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts

     return sys.argv[:1] + ns_args
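Side note on the last two hunks: the new flags are simply stashed as class attributes so individual test cases can read them. A rough, self-contained illustration of that parse_known_args wiring follows; the DemoTest name and the simplified setup are hypothetical, not the actual harness.

# Illustration only: parse optional CLI flags once and expose them as
# class attributes for unittest cases, mirroring how pre_gen_pte and
# llama_artifacts are wired above. Names here are hypothetical.
import argparse
import sys
import unittest


class DemoTest(unittest.TestCase):
    pre_gen_pte = None
    llama_artifacts = None

    def test_flags_visible(self):
        # Each test reads the flags without re-parsing sys.argv.
        if self.llama_artifacts is None:
            self.skipTest("missing --llama_artifacts")
        self.assertIsInstance(self.llama_artifacts, str)


def setup_environment():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pre_gen_pte", type=str)
    parser.add_argument("--llama_artifacts", type=str)
    # parse_known_args leaves unittest's own flags (e.g. -v) untouched.
    args, ns_args = parser.parse_known_args()
    DemoTest.pre_gen_pte = args.pre_gen_pte
    DemoTest.llama_artifacts = args.llama_artifacts
    return sys.argv[:1] + ns_args


if __name__ == "__main__":
    unittest.main(argv=setup_environment())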