@@ -108,6 +108,7 @@ jobs:
         declare -A DEVICE_POOL_ARNS
         DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
         DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+        DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
 
         # Resolve device names with their corresponding ARNs
         if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
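For reference, a minimal standalone sketch of how an associative-array lookup like the `DEVICE_POOL_ARNS` table above resolves a device name to its Device Farm pool ARN (the device name and placeholder ARN below are illustrative assumptions, not values taken from the workflow):

```bash
#!/usr/bin/env bash
declare -A DEVICE_POOL_ARNS
# Placeholder ARN for illustration only.
DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:000000000000:devicepool:EXAMPLE"

device="google_pixel_8_pro"
if [[ -n "${DEVICE_POOL_ARNS[$device]+set}" ]]; then
  echo "Resolved $device -> ${DEVICE_POOL_ARNS[$device]}"
else
  echo "Unknown device: $device" >&2
  exit 1
fi
```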
@@ -168,18 +169,20 @@ jobs:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
+    secrets: inherit
     strategy:
       matrix:
         model: ${{ fromJson(needs.set-parameters.outputs.models) }}
         delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
       fail-fast: false
     with:
-      runner: linux.4xlarge
+      runner: linux.2xlarge.memory
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
       timeout: 60
       upload-artifact: android-models
       upload-artifact-to-s3: true
+      secrets-env: EXECUTORCH_HF_TOKEN
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@ jobs:
           PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         fi
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
         ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
         echo "::endgroup::"
 
         echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
         BUILD_MODE="cmake"
-        DTYPE="fp32"
 
-        if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+          pip install -U "huggingface_hub[cli]"
+          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install accelerate sentencepiece
+          # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+          HF_MODEL_REPO=${{ matrix.model }}
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+            # Llama models on Hugging Face
+            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+              # SpinQuant
+              # Download prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --use_sdpa_with_kv_cache \
+                -X \
+                --xnnpack-extended-ops \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                -kv \
+                -d fp32 \
+                --preq_embedding_quantize 8,0 \
+                --use_spin_quant native \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+              # QAT + LoRA
+              # Download prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -qat \
+                -lora 16 \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --preq_embedding_quantize 8,0 \
+                --use_sdpa_with_kv_cache \
+                -kv \
+                -X \
+                --xnnpack-extended-ops \
+                -d fp32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            else
+              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+                # Original BF16 version, without any quantization
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+                python -m examples.models.llama.export_llama \
+                  --model "llama3_2" \
+                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                  --params "${DOWNLOADED_PATH}/params.json" \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -X \
+                  -d bf16 \
+                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              else
+                # By default, test with the Hugging Face model and the xnnpack recipe
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+                python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+          else
+            echo "Unsupported model ${{ matrix.model }}"
+            exit 1
+          fi
+
+          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          ls -lh model.zip
+          mkdir -p "${ARTIFACTS_DIR_NAME}"
+          mv model.zip "${ARTIFACTS_DIR_NAME}"
+        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
           # Test llama2
@@ -209,6 +307,7 @@ jobs:
             echo "Unsupported delegate ${{ matrix.delegate }}"
             exit 1
           fi
+          DTYPE="fp32"
           PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
             -model "${{ matrix.model }}" \
             -build_tool "${BUILD_MODE}" \
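For a concrete sense of the `OUT_ET_MODEL_NAME` derivation added in the export-models script above, here is a standalone sketch with an assumed Hugging Face repo id and delegate value (both are illustrative; in the workflow they come from `matrix.model` and `matrix.delegate`):

```bash
# Assumed example inputs.
HF_MODEL_REPO="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8"
DELEGATE="xnnpack"

# Same pipeline as the workflow: take the repo name after "/", replace "_"
# with "-", lowercase it, then append the delegate.
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${DELEGATE}"

echo "$OUT_ET_MODEL_NAME"
# -> llama-3.2-1b-instruct-spinquant-int4-eo8_xnnpack
```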