Commit 34a1d42 (2 parents: 0852c46 + eca5d9f)

Update base for Update on "[ET-VK] Replace Uniform buffers with push constants for copy op"

This diff replaces uniform buffers with push constants for the copy op in the Vulkan backend of ExecuTorch. The changes update the GLSL code to use push constants instead of uniform buffers, and update the C++ code to pass the sizes to the shader as push constants.

Differential Revision: [D66890851](https://our.internmc.facebook.com/intern/diff/D66890851/)

[ghstack-poisoned]

55 files changed: +1006 −1125 lines

Note: large commits have some content hidden by default, so only a subset of the changed files is shown below.

.ci/scripts/download_hf_hub.sh

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# Function to download files from the Hugging Face Hub
+# Arguments:
+#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
+#   2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
+#   3. file_names: A space-separated list of filenames to be downloaded
+# Returns:
+#   The directory containing the downloaded files
+function download_hf_files() {
+  local model_id="$1"
+  local subdir="$2"
+  shift 2
+  local file_names=("$@") # Capture all remaining arguments as an array
+
+  local download_dir
+
+  # Use the first file to determine the download directory
+  download_dir=$(python3 -c "
+from huggingface_hub import hf_hub_download
+# Download the first file and get its directory
+path = hf_hub_download(
+    repo_id='${model_id}',
+    filename='${subdir:+${subdir}/}${file_names[0]}'
+)
+import os
+print(os.path.dirname(path))")
+
+  if [ $? -ne 0 ]; then
+    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
+    return 1
+  fi
+
+  # Download the remaining files into the same directory
+  for file_name in "${file_names[@]:1}"; do
+    python3 -c "
+from huggingface_hub import hf_hub_download
+# Download the file
+hf_hub_download(
+    repo_id='${model_id}',
+    filename='${subdir:+${subdir}/}${file_name}'
+)"
+
+    if [ $? -ne 0 ]; then
+      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
+      return 1
+    fi
+  done
+
+  # Return the directory containing the downloaded files
+  echo "$download_dir"
+}
+
+# Check if the script is called directly
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  # Parse arguments from the CLI
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      --model_id)
+        MODEL_ID="$2"
+        shift 2
+        ;;
+      --subdir)
+        SUBDIR="$2"
+        shift 2
+        ;;
+      --files)
+        shift
+        FILES_TO_DOWNLOAD=()
+        while [[ $# -gt 0 && $1 != --* ]]; do
+          FILES_TO_DOWNLOAD+=("$1")
+          shift
+        done
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        exit 1
+        ;;
+    esac
+  done
+
+  # Validate required arguments
+  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
+    echo "Usage: $0 --model_id <model_id> --subdir <subdir> --files <file1> [<file2> ...]" >&2
+    exit 1
+  fi
+
+  # Call the function and print the download directory on success
+  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
+  if [ $? -eq 0 ]; then
+    echo "$DOWNLOAD_DIR"
+  else
+    exit 1
+  fi
+fi
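
For reference, a minimal usage sketch of this new helper, assuming it is run from the repository root with huggingface_hub installed (and a Hugging Face token already configured for gated repos); the repository ID and file names are illustrative, mirroring how the workflows below invoke it:

    # Fetch a tokenizer and checkpoint from the "original" subdirectory of a repo;
    # the script prints the local directory that contains the downloaded files.
    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh \
      --model_id "meta-llama/Llama-3.2-1B" \
      --subdir "original" \
      --files "tokenizer.model" "params.json" "consolidated.00.pth")
    echo "Downloaded to: ${DOWNLOADED_PATH}"

    # Inside the script, ${subdir:+${subdir}/} expands to "original/" when --subdir
    # is passed and to "" when it is omitted, so the remote filename resolves
    # correctly in both cases.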

.github/workflows/android-perf.yml

Lines changed: 102 additions & 3 deletions
@@ -108,6 +108,7 @@ jobs:
           declare -A DEVICE_POOL_ARNS
           DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
           DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+          DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"

           # Resolve device names with their corresponding ARNs
           if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then

@@ -168,18 +169,20 @@ jobs:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
+    secrets: inherit
     strategy:
       matrix:
         model: ${{ fromJson(needs.set-parameters.outputs.models) }}
         delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
       fail-fast: false
     with:
-      runner: linux.4xlarge
+      runner: linux.2xlarge.memory
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
       timeout: 60
       upload-artifact: android-models
       upload-artifact-to-s3: true
+      secrets-env: EXECUTORCH_HF_TOKEN
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         echo "::group::Setting up dev environment"

@@ -190,14 +193,109 @@ jobs:
           PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
         fi
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
         ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
         echo "::endgroup::"

         echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
         BUILD_MODE="cmake"
-        DTYPE="fp32"

-        if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+          pip install -U "huggingface_hub[cli]"
+          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          pip install accelerate sentencepiece
+          # Hugging Face model. Assume the pattern is always like "<org>/<repo>"
+          HF_MODEL_REPO=${{ matrix.model }}
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+            # Llama models on Hugging Face
+            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+              # SpinQuant
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --use_sdpa_with_kv_cache \
+                -X \
+                --xnnpack-extended-ops \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                -kv \
+                -d fp32 \
+                --preq_embedding_quantize 8,0 \
+                --use_spin_quant native \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+              # QAT + LoRA
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -qat \
+                -lora 16 \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --preq_embedding_quantize 8,0 \
+                --use_sdpa_with_kv_cache \
+                -kv \
+                -X \
+                --xnnpack-extended-ops \
+                -d fp32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            else
+              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+                # Original BF16 version, without any quantization
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+                python -m examples.models.llama.export_llama \
+                  --model "llama3_2" \
+                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                  --params "${DOWNLOADED_PATH}/params.json" \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -X \
+                  -d bf16 \
+                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              else
+                # By default, test with the Hugging Face model and the xnnpack recipe
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+                python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+          else
+            echo "Unsupported model ${{ matrix.model }}"
+            exit 1
+          fi
+
+          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          ls -lh model.zip
+          mkdir -p "${ARTIFACTS_DIR_NAME}"
+          mv model.zip "${ARTIFACTS_DIR_NAME}"
+        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
           # Test llama2

@@ -209,6 +307,7 @@ jobs:
             echo "Unsupported delegate ${{ matrix.delegate }}"
             exit 1
           fi
+          DTYPE="fp32"
           PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
             -model "${{ matrix.model }}" \
             -build_tool "${BUILD_MODE}" \
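
As an aside, the OUT_ET_MODEL_NAME derivation above normalizes a repository ID into an artifact name. Here is a small sketch of the same transformation; the repository ID and delegate below are examples, not values pinned by this workflow:

    # Illustrative only: reproduce the workflow's artifact-name derivation.
    HF_MODEL_REPO="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8"
    DELEGATE="xnnpack"
    # Take the repo part after "/", replace "_" with "-", and lowercase it.
    OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${DELEGATE}"
    echo "$OUT_ET_MODEL_NAME"  # prints: llama-3.2-1b-instruct-spinquant-int4-eo8_xnnpack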

.github/workflows/apple-perf.yml

Lines changed: 101 additions & 2 deletions
@@ -155,6 +155,7 @@ jobs:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     needs: set-parameters
+    secrets: inherit
     strategy:
       matrix:
         model: ${{ fromJson(needs.set-parameters.outputs.models) }}

@@ -168,6 +169,7 @@ jobs:
       timeout: 60
       upload-artifact: ios-models
       upload-artifact-to-s3: true
+      secrets-env: EXECUTORCH_HF_TOKEN
       script: |
         set -eux

@@ -189,14 +191,110 @@ jobs:
           backends/apple/mps/install_requirements.sh
         fi

+        # Install requirements for export_llama
+        PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
+
         ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
         echo "::endgroup::"

         echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
         BUILD_MODE="cmake"
-        DTYPE="fp32"

-        if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+          pip install -U "huggingface_hub[cli]"
+          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          ${CONDA_RUN} pip install accelerate sentencepiece
+          # Hugging Face model. Assume the pattern is always like "<org>/<repo>"
+          HF_MODEL_REPO=${{ matrix.model }}
+          OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+          if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+            # Llama models on Hugging Face
+            if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+              # SpinQuant
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                --use_sdpa_with_kv_cache \
+                -X \
+                --xnnpack-extended-ops \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                -kv \
+                -d fp32 \
+                --preq_embedding_quantize 8,0 \
+                --use_spin_quant native \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+              # QAT + LoRA
+              # Download the prequantized checkpoint from Hugging Face
+              DOWNLOADED_PATH=$(
+                bash .ci/scripts/download_hf_hub.sh \
+                  --model_id "${HF_MODEL_REPO}" \
+                  --files "tokenizer.model" "params.json" "consolidated.00.pth"
+              )
+              # Export using ExecuTorch's model definition
+              ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                --model "llama3_2" \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -qat \
+                -lora 16 \
+                --preq_mode 8da4w_output_8da8w \
+                --preq_group_size 32 \
+                --preq_embedding_quantize 8,0 \
+                --use_sdpa_with_kv_cache \
+                -kv \
+                -X \
+                --xnnpack-extended-ops \
+                -d fp32 \
+                --max_seq_length 2048 \
+                --output_name "${OUT_ET_MODEL_NAME}.pte" \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            else
+              if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+                # Original BF16 version, without any quantization
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+                ${CONDA_RUN} python -m examples.models.llama.export_llama \
+                  --model "llama3_2" \
+                  --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                  --params "${DOWNLOADED_PATH}/params.json" \
+                  -kv \
+                  --use_sdpa_with_kv_cache \
+                  -X \
+                  -d bf16 \
+                  --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                  --output_name="${OUT_ET_MODEL_NAME}.pte"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              else
+                # By default, test with the Hugging Face model and the xnnpack recipe
+                DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+                ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+                ls -lh "${OUT_ET_MODEL_NAME}.pte"
+              fi
+            fi
+          else
+            echo "Unsupported model ${{ matrix.model }}"
+            exit 1
+          fi
+
+          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          ls -lh model.zip
+          mkdir -p "${ARTIFACTS_DIR_NAME}"
+          mv model.zip "${ARTIFACTS_DIR_NAME}"
+        elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash examples/models/llama/install_requirements.sh

@@ -209,6 +307,7 @@ jobs:
         elif [[ ${{ matrix.delegate }} == "mps" ]]; then
           DELEGATE_CONFIG="mps"
         fi
+        DTYPE="fp32"
         PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
           bash .ci/scripts/test_llama.sh \
             -model "${{ matrix.model }}" \
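
Both workflows use the same gate to decide the export path: a model name of the form "<org>/<repo>" is treated as a Hugging Face repository ID, while names beginning with "stories" fall through to the existing test_llama.sh path. A small sketch of that classification, with example model names:

    # Illustrative only: classify model names the way both workflows do.
    for MODEL in "stories110M" "meta-llama/Llama-3.2-1B"; do
      if [[ $MODEL =~ ^[^/]+/[^/]+$ ]]; then
        echo "$MODEL -> Hugging Face repo ID (download_hf_hub.sh + export_llama/export_hf_model)"
      elif [[ $MODEL =~ ^stories* ]]; then
        echo "$MODEL -> stories model (existing test_llama.sh path)"
      fi
    done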
