
Commit 7990574

Merge branch 'main' into jz/hf-download

2 parents 2c6609a + 6ca39f8

File tree

23 files changed (+366, -107 lines)


.ci/scripts/build_android_instrumentation.sh

Lines changed: 6 additions & 26 deletions

@@ -12,30 +12,10 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
 fi
 which "${PYTHON_EXECUTABLE}"
 
-build_android_test() {
-  mkdir -p extension/android/executorch_android/src/androidTest/resources
-  cp extension/module/test/resources/add.pte extension/android/executorch_android/src/androidTest/resources
-  pushd extension/android
-  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
-  ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
-  popd
-}
+mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
+cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
 
-collect_artifacts_to_be_uploaded() {
-  ARTIFACTS_DIR_NAME="$1"
-  # Collect Java library test
-  JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir"
-  mkdir -p "${JAVA_LIBRARY_TEST_DIR}"
-  cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}"
-}
-
-main() {
-  build_android_test
-  if [ -n "$ARTIFACTS_DIR_NAME" ]; then
-    collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME}
-  fi
-}
-
-if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
-  main "$@"
-fi
+pushd "${BUILD_AAR_DIR}"
+ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
+ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
+popd

.ci/scripts/utils.sh

Lines changed: 39 additions & 5 deletions

@@ -60,12 +60,46 @@ install_pytorch_and_domains() {
   # Fetch the target commit
   pushd pytorch || return
   git checkout "${TORCH_VERSION}"
-  git submodule update --init --recursive
 
-  export USE_DISTRIBUTED=1
-  # Then build and install PyTorch
-  python setup.py bdist_wheel
-  pip install "$(echo dist/*.whl)"
+  local system_name=$(uname)
+  if [[ "${system_name}" == "Darwin" ]]; then
+    local platform=$(python -c 'import sysconfig; import platform; v=platform.mac_ver()[0].split(".")[0]; platform=sysconfig.get_platform().split("-"); platform[1]=f"{v}_0"; print("_".join(platform))')
+  fi
+  local python_version=$(python -c 'import platform; v=platform.python_version_tuple(); print(f"{v[0]}{v[1]}")')
+  local torch_release=$(cat version.txt)
+  local torch_short_hash=${TORCH_VERSION:0:7}
+  local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
+  local torch_wheel_name="torch-${torch_release}%2Bgit${torch_short_hash}-cp${python_version}-cp${python_version}-${platform:-}.whl"
+
+  local cached_torch_wheel="https://gha-artifacts.s3.us-east-1.amazonaws.com/${torch_wheel_path}/${torch_wheel_name}"
+  # Cache PyTorch wheel is only needed on MacOS, Linux CI already has this as part
+  # of the Docker image
+  local torch_wheel_not_found=0
+  if [[ "${system_name}" == "Darwin" ]]; then
+    pip install "${cached_torch_wheel}" || torch_wheel_not_found=1
+  else
+    torch_wheel_not_found=1
+  fi
+
+  # Found no such wheel, we will build it from source then
+  if [[ "${torch_wheel_not_found}" == "1" ]]; then
+    echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}"
+
+    git submodule update --init --recursive
+    USE_DISTRIBUTED=1 python setup.py bdist_wheel
+    pip install "$(echo dist/*.whl)"
+
+    # Only AWS runners have access to S3
+    if command -v aws && [[ -z "${GITHUB_RUNNER:-}" ]]; then
+      for wheel_path in dist/*.whl; do
+        local wheel_name=$(basename "${wheel_path}")
+        echo "Caching ${wheel_name}"
+        aws s3 cp "${wheel_path}" "s3://gha-artifacts/${torch_wheel_path}/${wheel_name}"
+      done
+    fi
+  else
+    echo "Use cached wheel at ${cached_torch_wheel}"
+  fi
 
   # Grab the pinned audio and vision commits from PyTorch
   TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)
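
For readers unpacking the inline `python -c` one-liners above, here is a standalone sketch (not part of the diff) of how the macOS wheel platform tag and the cached wheel filename are composed. The release, commit, and Python-version values at the bottom are illustrative only.

# Sketch of the wheel-name logic in install_pytorch_and_domains(); macOS-only.
import platform
import sysconfig

def macos_wheel_platform_tag() -> str:
    # e.g. sysconfig.get_platform() == "macosx-11.0-arm64" on macOS 14.x
    # becomes "macosx_14_0_arm64"; the minor version is pinned to 0 so a
    # single cached wheel matches every patch release of that macOS major.
    major = platform.mac_ver()[0].split(".")[0]
    parts = sysconfig.get_platform().split("-")
    parts[1] = f"{major}_0"
    return "_".join(parts)

def cached_wheel_name(torch_release: str, torch_commit: str, py: str) -> str:
    # "%2B" is the URL-encoded "+" that introduces the local version segment.
    short_hash = torch_commit[:7]
    return (
        f"torch-{torch_release}%2Bgit{short_hash}"
        f"-cp{py}-cp{py}-{macos_wheel_platform_tag()}.whl"
    )

# Illustrative values; the script reads them from version.txt and TORCH_VERSION.
print(cached_wheel_name("2.7.0", "0123456789abcdef", "311"))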

.github/workflows/_android.yml

Lines changed: 12 additions & 7 deletions

@@ -27,16 +27,21 @@ jobs:
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool buck2
         export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
-
-        mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
-        bash examples/models/llama/install_requirements.sh
-        bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+        mkdir -p ${ARTIFACTS_DIR_NAME}/
 
         # Build LLM Demo for Android
         export BUILD_AAR_DIR=aar-out
         mkdir -p $BUILD_AAR_DIR
-        bash scripts/build_android_library.sh ${ARTIFACTS_DIR_NAME}
-        bash .ci/scripts/build_android_instrumentation.sh ${ARTIFACTS_DIR_NAME}
+        bash scripts/build_android_library.sh
+        cp ${BUILD_AAR_DIR}/executorch.aar $ARTIFACTS_DIR_NAME
+
+        mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir
+        bash .ci/scripts/build_android_instrumentation.sh
+        cp ${BUILD_AAR_DIR}/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir"
+
+        mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
+        bash examples/models/llama/install_requirements.sh
+        bash ".ci/scripts/test_llama.sh" -model stories110M -build_tool cmake -dtype fp16 -mode portable -upload ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
 
         mkdir -p examples/demo-apps/android/LlamaDemo/app/libs
         cp aar-out/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs

@@ -96,7 +101,7 @@ jobs:
         curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug.apk
         curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/app-debug-androidTest.apk
         curl -O https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/fp32-xnnpack-custom/model.zip
-        curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch-debug-androidTest.apk
+        curl -o android-test-debug-androidTest.apk https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/library_test_dir/executorch_android-debug-androidTest.apk
         unzip model.zip
         mv *.pte model.pte

.github/workflows/_unittest.yml

Lines changed: 2 additions & 0 deletions

@@ -49,4 +49,6 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
+        # This is needed to get the prebuilt PyTorch wheel from S3
+        ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21
         .ci/scripts/unittest-macos.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion

@@ -228,7 +228,7 @@ jobs:
     name: test-coreml-delegate
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     with:
-      runner: macos-13-xlarge
+      runner: macos-latest-xlarge
      python-version: '3.11'
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

backends/arm/runtime/EthosUBackend.cpp

Lines changed: 5 additions & 1 deletion

@@ -10,6 +10,7 @@
  * ethos-u-core-driver for hardware interaction.
  */
 
+#include <cstdint>
 #include <cstring>
 #include <memory>
 

@@ -282,7 +283,10 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
     // constant weight data, then scratch (which contains input and output)
     // scratch is written above in this function.
     uint64_t bases[2] = {
-        (uint64_t)handles.weight_data, (uint64_t)handles.scratch_data};
+        static_cast<uint64_t>(
+            reinterpret_cast<uintptr_t>((handles.weight_data))),
+        static_cast<uint64_t>(
+            reinterpret_cast<uintptr_t>((handles.scratch_data)))};
     size_t bases_size[2] = {
         handles.weight_data_size, handles.scratch_data_size};
     int result = 0;

examples/models/llama/README.md

Lines changed: 4 additions & 0 deletions

@@ -412,6 +412,10 @@ python -m examples.models.llama.export_llama \
     -d fp32
 ```
 
+A few notes:
+- If your model shares embedding/unembedding weights (like Llama1B and Llama3B do), you can add `--use_shared_embedding` to take advantage of this and reduce memory. When this option is enabled, you can specify whether embeddings are quantized with weight zeros or not by specifying a third argument. For example, `-E "torchao:4,32,true"` means that the embedding is quantized to 4-bits with group_size=32 and uses weight zeros (this is the default behavior if you simply use `-E "torchao:4,32"`), whereas `-E "torchao:4,32,false"` means that the embedding is quantized to 4-bits with group_size=32, but is quantized with scales-only. If `--use_shared_embedding` is specified, the unembedding (i.e., the final linear layer) is quantized in the same way, but also uses 8-bit dynamically quantized activations.
+- To do channelwise quantization, specify group_size to 0. This works for both linear and embedding layers.
+
 Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels.
 
 The first step is to install ExecuTorch (the same as step 3.1 above):
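
As a rough sketch of what these spec strings select (based on the quantize.py changes later in this commit, so treat it as illustrative rather than the export code itself), `-E "torchao:4,32,true"` maps onto the torchao embedding quantizer like so:

# Sketch, assuming the torchao APIs imported by this commit's quantize.py.
import torch
from torchao.experimental.quant_api import EmbeddingQuantizer
from torchao.quantization.granularity import PerGroup, PerRow

bitwidth, group_size, has_weight_zeros = 4, 32, True  # "torchao:4,32,true"
quantizer = EmbeddingQuantizer(
    weight_dtype=getattr(torch, f"int{bitwidth}"),  # torch.int4
    # group_size == 0 selects channelwise (PerRow), per the second note above.
    granularity=PerRow() if group_size == 0 else PerGroup(group_size),
    has_weight_zeros=has_weight_zeros,  # False would mean scales-only
    use_fallback=False,
)
# With --use_shared_embedding, SharedEmbeddingQuantizer is used instead, and the
# unembedding (final linear layer) is quantized the same way.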

examples/models/llama/export_llama_lib.py

Lines changed: 30 additions & 13 deletions

@@ -99,6 +99,7 @@
     "static_llama",
     "qwen2_5",
     "phi_4_mini",
+    "smollm2",
 ]
 TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"]
 HUGGING_FACE_REPO_IDS = {

@@ -162,6 +163,11 @@ def build_args_parser() -> argparse.ArgumentParser:
         type=str,
         help="type of embedding quantization, '<bitwidth>,<groupsize>', e.g., '8,1024'.",
     )
+    parser.add_argument(
+        "--use_shared_embedding",
+        action="store_true",
+        help="Whether the embedding/unembedding weights should be shared. Only available with torchao kernels.",
+    )
     parser.add_argument(
         "--pt2e_quantize",
         default=None,

@@ -705,6 +711,15 @@ def _validate_args(args):
     if args.num_sharding > 0 and not args.qnn:
         raise ValueError("Model shard is only supported with qnn backend now.")
 
+    if args.use_shared_embedding:
+        if not (
+            args.embedding_quantize is not None
+            and args.embedding_quantize.startswith("torchao:")
+        ):
+            raise ValueError(
+                "Shared embedding is only supported with torchao quantization."
+            )
+
     if (
         args.quantization_mode is not None
         and args.quantization_mode.startswith("torchao:")

@@ -1143,6 +1158,21 @@ def _get_source_transforms(  # noqa
 
             transforms.append(inject_fast_hadamard_transform_native_for_spin_quant)
 
+    if args.embedding_quantize:
+        """
+        When this option is selected, it finds all embedding layers and transforms
+        into quantized embedding equivalent module.
+
+        There are cases where the checkpoint is already quantized, for example
+        on use_spin_quant is enabled. In that case, it will do the appropriate
+        transformations based on the given checkpoint first. In those cases,
+        this wil be a no-op.
+        """
+        modelname = f"{modelname}_e"
+        transforms.append(get_quant_embedding_transform(args, checkpoint_dtype))
+
+    # quantization_mode should be applied after embedding_quantize
+    # to support shared_embedding
     if args.quantization_mode:
         """
         When this option is selected, it finds all linear layers and transforms

@@ -1166,19 +1196,6 @@ def _get_source_transforms(  # noqa
             )
         )
 
-    if args.embedding_quantize:
-        """
-        When this option is selected, it finds all embedding layers and transforms
-        into quantized embedding equivalent module.
-
-        There are cases where the checkpoint is already quantized, for example
-        on use_spin_quant is enabled. In that case, it will do the appropriate
-        transformations based on the given checkpoint first. In those cases,
-        this wil be a no-op.
-        """
-        modelname = f"{modelname}_e"
-        transforms.append(get_quant_embedding_transform(args, checkpoint_dtype))
-
     if args.expand_rope_table:
         transforms.append(materialze_broadcast_of_rope_freq_cis)
 
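
A minimal sketch of the new _validate_args rule: --use_shared_embedding is accepted only alongside a torchao embedding-quantization spec. The Namespace values below are illustrative stand-ins for parsed CLI arguments, not part of the diff.

from argparse import Namespace

def check_shared_embedding(args: Namespace) -> None:
    # Mirrors the validation added to _validate_args in this diff.
    if args.use_shared_embedding and not (
        args.embedding_quantize is not None
        and args.embedding_quantize.startswith("torchao:")
    ):
        raise ValueError(
            "Shared embedding is only supported with torchao quantization."
        )

check_shared_embedding(
    Namespace(use_shared_embedding=True, embedding_quantize="torchao:4,32")
)  # accepted
try:
    check_shared_embedding(
        Namespace(use_shared_embedding=True, embedding_quantize="8,1024")
    )
except ValueError as e:
    print(e)  # rejected: the spec does not start with "torchao:"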

examples/models/llama/llama_test.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

examples/models/llama/source_transformation/quantize.py

Lines changed: 33 additions & 11 deletions

@@ -124,9 +124,7 @@ def quantize(  # noqa C901
             model,
             Int8DynamicActivationIntxWeightConfig(
                 weight_dtype=getattr(torch, f"int{bitwidth}"),
-                granularity=(
-                    PerRow() if group_size in [0, -1] else PerGroup(group_size)
-                ),
+                granularity=(PerRow() if group_size == 0 else PerGroup(group_size)),
                 has_weight_zeros=False,
             ),
         )

@@ -786,19 +784,43 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
 
 def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
     if args.embedding_quantize.startswith("torchao:"):
-        bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
+        from torchao.experimental.quant_api import (
+            EmbeddingQuantizer,
+            SharedEmbeddingQuantizer,
+        )
+        from torchao.quantization.granularity import PerGroup, PerRow
+
+        quant_args = args.embedding_quantize.split(":")[1].split(",")
+        if len(quant_args) == 2:
+            bitwidth, group_size = quant_args
+            has_weight_zeros = True
+        else:
+            bitwidth, group_size, has_weight_zeros = quant_args
+
+        if group_size in ["none", "None", "0"]:
+            group_size = 0
+
         group_size = int(group_size)
         bitwidth = int(bitwidth)
-        from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer
+        has_weight_zeros = bool(has_weight_zeros)
+        weight_dtype = getattr(torch, f"int{bitwidth}")
+        granularity = PerRow() if group_size == 0 else PerGroup(group_size)
 
         def _torchao_embedding_quantizer(model):
             with torch.no_grad():
-                model = IntxWeightEmbeddingQuantizer(
-                    device="cpu",
-                    precision=torch.float32,
-                    bitwidth=bitwidth,
-                    groupsize=group_size,
-                ).quantize(model)
+                if not args.use_shared_embedding:
+                    EmbeddingQuantizer(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                        use_fallback=False,
+                    ).quantize(model)
+                else:
+                    SharedEmbeddingQuantizer(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                    ).quantize(model)
             return model
 
         return _torchao_embedding_quantizer
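
A standalone sketch of how the extended "torchao:<bitwidth>,<groupsize>[,<has_weight_zeros>]" spec parses. One Python subtlety worth noting when reimplementing this: bool() on any non-empty string is True, so a parser that receives the third field as a raw string should compare it explicitly to treat "false" as scales-only.

# Sketch only; field names follow get_quant_embedding_transform above.
def parse_embedding_spec(spec: str):
    fields = spec.split(":")[1].split(",")
    if len(fields) == 2:
        (bitwidth, group_size), zeros = fields, "true"
    else:
        bitwidth, group_size, zeros = fields
    if group_size in ("none", "None", "0"):
        group_size = "0"
    # Explicit comparison, since bool("false") is True in Python.
    return int(bitwidth), int(group_size), zeros.strip().lower() == "true"

print(parse_embedding_spec("torchao:4,32"))        # (4, 32, True)
print(parse_embedding_spec("torchao:4,32,false"))  # (4, 32, False)
print(parse_embedding_spec("torchao:8,0"))         # (8, 0, True) -> channelwise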
