Add HuggingFace Llama3.2 1B to benchmark #5368

Merged
merged 2 commits on Dec 18, 2024
95 changes: 95 additions & 0 deletions .ci/scripts/download_hf_hub.sh
@@ -0,0 +1,95 @@
#!/bin/bash

# Function to download files from the Hugging Face Hub
# Arguments:
# 1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
# 2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
# 3. file_names: A space-separated list of filenames to be downloaded
# Returns:
# The directory containing the downloaded files
function download_hf_files() {
local model_id="$1"
local subdir="$2"
shift 2
local file_names=("$@") # Capture all remaining arguments as an array

local download_dir

# Use the first file to determine the download directory
download_dir=$(python3 -c "
import os
from huggingface_hub import hf_hub_download
# Download the first file and get its directory
path = hf_hub_download(
repo_id='${model_id}',
filename='${subdir:+${subdir}/}${file_names[0]}'
)
print(os.path.dirname(path))")

if [ $? -ne 0 ]; then
echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
return 1
fi

# Download remaining files into the same directory
for file_name in "${file_names[@]:1}"; do
python3 -c "
from huggingface_hub import hf_hub_download
# Download the file
hf_hub_download(
repo_id='${model_id}',
filename='${subdir:+${subdir}/}${file_name}'
)"

if [ $? -ne 0 ]; then
echo "Error: Failed to download ${file_name} from ${model_id}" >&2
return 1
fi
done

# Return the directory containing the downloaded files
echo "$download_dir"
}

# Run the CLI wrapper only when the script is executed directly (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# Parse arguments from CLI
while [[ $# -gt 0 ]]; do
case $1 in
--model_id)
MODEL_ID="$2"
shift 2
;;
--subdir)
SUBDIR="$2"
shift 2
;;
--files)
shift
FILES_TO_DOWNLOAD=()
while [[ $# -gt 0 && $1 != --* ]]; do
FILES_TO_DOWNLOAD+=("$1")
shift
done
;;
*)
echo "Unknown option: $1" >&2
exit 1
;;
esac
done

# Validate required arguments
if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
echo "Usage: $0 --model_id <model_id> --subdir <subdir> --files <file1> [<file2> ...]" >&2
exit 1
fi

# Call the function
DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
if [ $? -eq 0 ]; then
echo "$DOWNLOAD_DIR"
else
exit 1
fi
fi
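
For reference, a minimal usage sketch of this helper. The repository and file names below mirror the bf16 branch of the workflow changes that follow and are illustrative only:

# Download tokenizer, params, and checkpoint into the local HF cache and capture the directory
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh \
--model_id "meta-llama/Llama-3.2-1B" \
--subdir "original" \
--files "tokenizer.model" "params.json" "consolidated.00.pth")
# The script prints the containing directory, so files can be referenced as "${DOWNLOADED_PATH}/params.json"
ls -lh "${DOWNLOADED_PATH}"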
105 changes: 102 additions & 3 deletions .github/workflows/android-perf.yml
@@ -108,6 +108,7 @@ jobs:
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"

# Resolve device names with their corresponding ARNs
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
fail-fast: false
with:
runner: linux.4xlarge
runner: linux.2xlarge.memory
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
timeout: 60
upload-artifact: android-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
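# secrets-env exposes EXECUTORCH_HF_TOKEN inside the job script as SECRET_EXECUTORCH_HF_TOKEN (used by huggingface-cli login below)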
script: |
# The generic Linux job chooses to use the base env, not the one set up by the image
echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
fi
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"

echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"

if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
pip install -U "huggingface_hub[cli]"
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
pip install accelerate sentencepiece
# HuggingFace model. Assume the pattern is always like "<org>/<repo>"
HF_MODEL_REPO=${{ matrix.model }}
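# Lowercase the repo name, replace underscores with hyphens, and append the delegate,
# e.g. (illustrative) meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8 + xnnpack -> llama-3.2-1b-instruct-spinquant-int4-eo8_xnnpack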
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"

if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
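# The --metadata BOS/EOS ids below are the Llama 3 special tokens (128000 = begin_of_text, 128009/128001 = eot_id/end_of_text)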
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi

zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
@@ -209,6 +307,7 @@ jobs:
echo "Unsupported delegate ${{ matrix.delegate }}"
exit 1
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \
-build_tool "${BUILD_MODE}" \
103 changes: 101 additions & 2 deletions .github/workflows/apple-perf.yml
@@ -155,6 +155,7 @@ jobs:
name: export-models
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
needs: set-parameters
secrets: inherit
strategy:
matrix:
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
@@ -168,6 +169,7 @@
timeout: 60
upload-artifact: ios-models
upload-artifact-to-s3: true
secrets-env: EXECUTORCH_HF_TOKEN
script: |
set -eux

@@ -189,14 +191,110 @@
backends/apple/mps/install_requirements.sh
fi

# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh

ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
echo "::endgroup::"

echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
BUILD_MODE="cmake"
DTYPE="fp32"

if [[ ${{ matrix.model }} =~ ^stories* ]]; then
if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
pip install -U "huggingface_hub[cli]"
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
${CONDA_RUN} pip install accelerate sentencepiece
# HuggingFace model. Assume the pattern is always like "<org>/<repo>"
HF_MODEL_REPO=${{ matrix.model }}
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"

if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
# Llama models on Hugging Face
if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
# SpinQuant
# Download prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
--use_sdpa_with_kv_cache \
-X \
--xnnpack-extended-ops \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
-kv \
-d fp32 \
--preq_embedding_quantize 8,0 \
--use_spin_quant native \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
# QAT + LoRA
# Download prequantized checkpoint from Hugging Face
DOWNLOADED_PATH=$(
bash .ci/scripts/download_hf_hub.sh \
--model_id "${HF_MODEL_REPO}" \
--files "tokenizer.model" "params.json" "consolidated.00.pth"
)
# Export using ExecuTorch's model definition
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-qat \
-lora 16 \
--preq_mode 8da4w_output_8da8w \
--preq_group_size 32 \
--preq_embedding_quantize 8,0 \
--use_sdpa_with_kv_cache \
-kv \
-X \
--xnnpack-extended-ops \
-d fp32 \
--max_seq_length 2048 \
--output_name "${OUT_ET_MODEL_NAME}.pte" \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
# Original BF16 version, without any quantization
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
${CONDA_RUN} python -m examples.models.llama.export_llama \
--model "llama3_2" \
--checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
--params "${DOWNLOADED_PATH}/params.json" \
-kv \
--use_sdpa_with_kv_cache \
-X \
-d bf16 \
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
--output_name="${OUT_ET_MODEL_NAME}.pte"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
else
# By default, test with the Hugging Face model and the xnnpack recipe
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
ls -lh "${OUT_ET_MODEL_NAME}.pte"
fi
fi
else
echo "Unsupported model ${{ matrix.model }}"
exit 1
fi

zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
ls -lh model.zip
mkdir -p "${ARTIFACTS_DIR_NAME}"
mv model.zip "${ARTIFACTS_DIR_NAME}"
elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
# Install requirements for export_llama
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash examples/models/llama/install_requirements.sh
@@ -209,6 +307,7 @@ jobs:
elif [[ ${{ matrix.delegate }} == "mps" ]]; then
DELEGATE_CONFIG="mps"
fi
DTYPE="fp32"
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash .ci/scripts/test_llama.sh \
-model "${{ matrix.model }}" \