
Commit 72bb7b7

Authored by guangy10 (Guang Yang) and Github Executorch
Add HuggingFace Llama3.2 1B to benchmark (#5368)
* Add compatible HuggingFace models to benchmark workflow
* Replace ones with rand to work around the crash from the SDPA kernel

Co-authored-by: Guang Yang <[email protected]>
Co-authored-by: Github Executorch <[email protected]>
1 parent dabb14e commit 72bb7b7
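The "replace ones with rand" item refers to the dummy example inputs used when exporting the model: all-ones tensors triggered a crash in the SDPA kernel, so random values are used instead. A minimal Python sketch of that kind of swap; the shapes and variable names are assumptions for illustration, not the actual exporter code:

import torch

batch, seq_len, hidden = 1, 8, 2048  # assumed shapes, illustration only

# Before: all-ones example inputs crashed the SDPA kernel during export.
# example_hidden = torch.ones(batch, seq_len, hidden)

# After: random example inputs avoid the degenerate all-ones case.
example_hidden = torch.rand(batch, seq_len, hidden)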

7 files changed: +308 -23 lines changed

.ci/scripts/download_hf_hub.sh

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
#!/bin/bash

# Function to download files from the Hugging Face Hub
# Arguments:
#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
#   2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
#   3. file_names: A space-separated list of filenames to be downloaded
# Returns:
#   The directory containing the downloaded files
function download_hf_files() {
  local model_id="$1"
  local subdir="$2"
  shift 2
  local file_names=("$@") # Capture all remaining arguments as an array

  local download_dir

  # Use the first file to determine the download directory
  download_dir=$(python3 -c "
from huggingface_hub import hf_hub_download
# Download the first file and get its directory
path = hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_names[0]}'
)
import os
print(os.path.dirname(path))")

  if [ $? -ne 0 ]; then
    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
    return 1
  fi

  # Download remaining files into the same directory
  for file_name in "${file_names[@]:1}"; do
    python3 -c "
from huggingface_hub import hf_hub_download
# Download the file
hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_name}'
)"

    if [ $? -ne 0 ]; then
      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
      return 1
    fi
  done

  # Return the directory containing the downloaded files
  echo "$download_dir"
}

# Check if script is called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  # Parse arguments from CLI
  while [[ $# -gt 0 ]]; do
    case $1 in
      --model_id)
        MODEL_ID="$2"
        shift 2
        ;;
      --subdir)
        SUBDIR="$2"
        shift 2
        ;;
      --files)
        shift
        FILES_TO_DOWNLOAD=()
        while [[ $# -gt 0 && $1 != --* ]]; do
          FILES_TO_DOWNLOAD+=("$1")
          shift
        done
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Validate required arguments
  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
    echo "Usage: $0 --model_id <model_id> --subdir <subdir> --files <file1> [<file2> ...]" >&2
    exit 1
  fi

  # Call the function
  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
  if [ $? -eq 0 ]; then
    echo "$DOWNLOAD_DIR"
  else
    exit 1
  fi
fi
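The script echoes the local directory containing the downloaded files, so callers capture it with command substitution; the ${subdir:+${subdir}/} expansion prefixes each filename with the subdirectory only when --subdir is set. The benchmark workflows below invoke it along these lines (model id, subdir, and filenames taken from the export steps further down):

DOWNLOADED_PATH=$(
  bash .ci/scripts/download_hf_hub.sh \
    --model_id "${HF_MODEL_REPO}" \
    --subdir "original" \
    --files "tokenizer.model" "params.json" "consolidated.00.pth"
)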

.github/workflows/android-perf.yml

Lines changed: 102 additions & 3 deletions
@@ -108,6 +108,7 @@ jobs:
 declare -A DEVICE_POOL_ARNS
 DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
 DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"

 # Resolve device names with their corresponding ARNs
 if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then

@@ -168,18 +169,20 @@ jobs:
 name: export-models
 uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
 needs: set-parameters
+secrets: inherit
 strategy:
   matrix:
     model: ${{ fromJson(needs.set-parameters.outputs.models) }}
     delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
   fail-fast: false
 with:
-  runner: linux.4xlarge
+  runner: linux.2xlarge.memory
   docker-image: executorch-ubuntu-22.04-qnn-sdk
   submodules: 'true'
   timeout: 60
   upload-artifact: android-models
   upload-artifact-to-s3: true
+  secrets-env: EXECUTORCH_HF_TOKEN
   script: |
     # The generic Linux job chooses to use base env, not the one setup by the image
     echo "::group::Setting up dev environment"

@@ -190,14 +193,109 @@ jobs:
   PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 fi
 PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+# Install requirements for export_llama
+PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
 ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
 echo "::endgroup::"

 echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
 BUILD_MODE="cmake"
-DTYPE="fp32"

-if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+  pip install -U "huggingface_hub[cli]"
+  huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+  pip install accelerate sentencepiece
+  # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+  HF_MODEL_REPO=${{ matrix.model }}
+  OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+  if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+    # Llama models on Hugging Face
+    if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+      # SpinQuant
+      # Download prequantized checkpoint from Hugging Face
+      DOWNLOADED_PATH=$(
+        bash .ci/scripts/download_hf_hub.sh \
+          --model_id "${HF_MODEL_REPO}" \
+          --files "tokenizer.model" "params.json" "consolidated.00.pth"
+      )
+      # Export using ExecuTorch's model definition
+      python -m examples.models.llama.export_llama \
+        --model "llama3_2" \
+        --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+        --params "${DOWNLOADED_PATH}/params.json" \
+        --use_sdpa_with_kv_cache \
+        -X \
+        --xnnpack-extended-ops \
+        --preq_mode 8da4w_output_8da8w \
+        --preq_group_size 32 \
+        --max_seq_length 2048 \
+        --output_name "${OUT_ET_MODEL_NAME}.pte" \
+        -kv \
+        -d fp32 \
+        --preq_embedding_quantize 8,0 \
+        --use_spin_quant native \
+        --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+      ls -lh "${OUT_ET_MODEL_NAME}.pte"
+    elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+      # QAT + LoRA
+      # Download prequantized checkpoint from Hugging Face
+      DOWNLOADED_PATH=$(
+        bash .ci/scripts/download_hf_hub.sh \
+          --model_id "${HF_MODEL_REPO}" \
+          --files "tokenizer.model" "params.json" "consolidated.00.pth"
+      )
+      # Export using ExecuTorch's model definition
+      python -m examples.models.llama.export_llama \
+        --model "llama3_2" \
+        --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+        --params "${DOWNLOADED_PATH}/params.json" \
+        -qat \
+        -lora 16 \
+        --preq_mode 8da4w_output_8da8w \
+        --preq_group_size 32 \
+        --preq_embedding_quantize 8,0 \
+        --use_sdpa_with_kv_cache \
+        -kv \
+        -X \
+        --xnnpack-extended-ops \
+        -d fp32 \
+        --max_seq_length 2048 \
+        --output_name "${OUT_ET_MODEL_NAME}.pte" \
+        --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+      ls -lh "${OUT_ET_MODEL_NAME}.pte"
+    else
+      if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+        # Original BF16 version, without any quantization
+        DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+        python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          -kv \
+          --use_sdpa_with_kv_cache \
+          -X \
+          -d bf16 \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+          --output_name="${OUT_ET_MODEL_NAME}.pte"
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      else
+        # By default, test with the Hugging Face model and the xnnpack recipe
+        DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+        python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      fi
+    fi
+  else
+    echo "Unsupported model ${{ matrix.model }}"
+    exit 1
+  fi
+
+  zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+  ls -lh model.zip
+  mkdir -p "${ARTIFACTS_DIR_NAME}"
+  mv model.zip "${ARTIFACTS_DIR_NAME}"
+elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
   # Install requirements for export_llama
   PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
   # Test llama2

@@ -209,6 +307,7 @@ jobs:
   echo "Unsupported delegate ${{ matrix.delegate }}"
   exit 1
 fi
+DTYPE="fp32"
 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
   -model "${{ matrix.model }}" \
   -build_tool "${BUILD_MODE}" \
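In the export step above, OUT_ET_MODEL_NAME is derived from the repo id by keeping the part after the slash, replacing underscores with hyphens, lowercasing, and appending the delegate. A small Python sketch of that awk/sed/tr pipeline; the example repo id and delegate value are assumptions:

def et_model_name(hf_model_repo: str, delegate: str) -> str:
    # Mirrors: awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]'
    repo = hf_model_repo.split("/", 1)[1]
    return f"{repo.replace('_', '-').lower()}_{delegate}"

# Assumed example: the Llama3.2 1B repo with the xnnpack delegate
print(et_model_name("meta-llama/Llama-3.2-1B", "xnnpack"))  # llama-3.2-1b_xnnpack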

.github/workflows/apple-perf.yml

Lines changed: 101 additions & 2 deletions
@@ -155,6 +155,7 @@ jobs:
 name: export-models
 uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
 needs: set-parameters
+secrets: inherit
 strategy:
   matrix:
     model: ${{ fromJson(needs.set-parameters.outputs.models) }}

@@ -168,6 +169,7 @@ jobs:
 timeout: 60
 upload-artifact: ios-models
 upload-artifact-to-s3: true
+secrets-env: EXECUTORCH_HF_TOKEN
 script: |
   set -eux

@@ -189,14 +191,110 @@ jobs:
   backends/apple/mps/install_requirements.sh
 fi

+# Install requirements for export_llama
+PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
+
 ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
 echo "::endgroup::"

 echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
 BUILD_MODE="cmake"
-DTYPE="fp32"

-if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+  pip install -U "huggingface_hub[cli]"
+  huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+  ${CONDA_RUN} pip install accelerate sentencepiece
+  # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+  HF_MODEL_REPO=${{ matrix.model }}
+  OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+  if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+    # Llama models on Hugging Face
+    if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+      # SpinQuant
+      # Download prequantized checkpoint from Hugging Face
+      DOWNLOADED_PATH=$(
+        bash .ci/scripts/download_hf_hub.sh \
+          --model_id "${HF_MODEL_REPO}" \
+          --files "tokenizer.model" "params.json" "consolidated.00.pth"
+      )
+      # Export using ExecuTorch's model definition
+      ${CONDA_RUN} python -m examples.models.llama.export_llama \
+        --model "llama3_2" \
+        --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+        --params "${DOWNLOADED_PATH}/params.json" \
+        --use_sdpa_with_kv_cache \
+        -X \
+        --xnnpack-extended-ops \
+        --preq_mode 8da4w_output_8da8w \
+        --preq_group_size 32 \
+        --max_seq_length 2048 \
+        --output_name "${OUT_ET_MODEL_NAME}.pte" \
+        -kv \
+        -d fp32 \
+        --preq_embedding_quantize 8,0 \
+        --use_spin_quant native \
+        --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+      ls -lh "${OUT_ET_MODEL_NAME}.pte"
+    elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+      # QAT + LoRA
+      # Download prequantized checkpoint from Hugging Face
+      DOWNLOADED_PATH=$(
+        bash .ci/scripts/download_hf_hub.sh \
+          --model_id "${HF_MODEL_REPO}" \
+          --files "tokenizer.model" "params.json" "consolidated.00.pth"
+      )
+      # Export using ExecuTorch's model definition
+      ${CONDA_RUN} python -m examples.models.llama.export_llama \
+        --model "llama3_2" \
+        --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+        --params "${DOWNLOADED_PATH}/params.json" \
+        -qat \
+        -lora 16 \
+        --preq_mode 8da4w_output_8da8w \
+        --preq_group_size 32 \
+        --preq_embedding_quantize 8,0 \
+        --use_sdpa_with_kv_cache \
+        -kv \
+        -X \
+        --xnnpack-extended-ops \
+        -d fp32 \
+        --max_seq_length 2048 \
+        --output_name "${OUT_ET_MODEL_NAME}.pte" \
+        --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+      ls -lh "${OUT_ET_MODEL_NAME}.pte"
+    else
+      if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+        # Original BF16 version, without any quantization
+        DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+        ${CONDA_RUN} python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          -kv \
+          --use_sdpa_with_kv_cache \
+          -X \
+          -d bf16 \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+          --output_name="${OUT_ET_MODEL_NAME}.pte"
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      else
+        # By default, test with the Hugging Face model and the xnnpack recipe
+        DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+        ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      fi
+    fi
+  else
+    echo "Unsupported model ${{ matrix.model }}"
+    exit 1
+  fi
+
+  zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+  ls -lh model.zip
+  mkdir -p "${ARTIFACTS_DIR_NAME}"
+  mv model.zip "${ARTIFACTS_DIR_NAME}"
+elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
   # Install requirements for export_llama
   PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
     bash examples/models/llama/install_requirements.sh

@@ -209,6 +307,7 @@ jobs:
 elif [[ ${{ matrix.delegate }} == "mps" ]]; then
   DELEGATE_CONFIG="mps"
 fi
+DTYPE="fp32"
 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
   bash .ci/scripts/test_llama.sh \
     -model "${{ matrix.model }}" \
