
Commit 9ff2e7d

Merge branch 'main' into angelayi/aoti_python
2 parents 5ca0943 + 76c1cd2

83 files changed: +3767 −652 lines

Note: this is a large commit, so some of the changed files are hidden by default and are not shown below.

.ci/scripts/run-docs

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ fi
 
 if [ "$1" == "readme" ]; then
 echo "::group::Create script to run README"
-python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
+python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
 # for good measure, if something happened to updown processor,
 # and it did not error out, fail with an exit 1
 echo "exit 1" >> ./run-readme.sh

.github/workflows/pull.yml

Lines changed: 6 additions & 27 deletions
@@ -1092,32 +1092,11 @@ jobs:
         id: install-torchao-ops
         run: |
           bash torchchat/utils/scripts/build_torchao_ops.sh
-      - name: Set git shas
-        id: setup-hash
-        run: |
-          export TORCHCHAT_ROOT=${PWD}
-          echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
-      - name: Load or install ET
-        id: install-et
-        uses: actions/cache@v4
-        with:
-          path: |
-            ./et-build
-            ./torchchat/utils/scripts/install_et.sh
-          key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
-      - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
-        continue-on-error: true
+      - name: Install ET
         run: |
           echo "Installing ExecuTorch"
+          export TORCHCHAT_ROOT=${PWD}
           bash torchchat/utils/scripts/install_et.sh
-      - name: Install ExecuTorch python
-        run: |
-          echo "Install ExecuTorch python"
-          export TORCHCHAT_ROOT=$PWD
-          export ET_BUILD_DIR="et-build"
-          ENABLE_ET_PYBIND="${1:-true}"
-          source "torchchat/utils/scripts/install_utils.sh"
-          install_executorch_python_libs $ENABLE_ET_PYBIND
       - name: Install runner
         run: |
           echo "Installing runner"
@@ -1132,14 +1111,14 @@ jobs:
           wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
           export PRMT="Once upon a time in a land far away"
           echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
           echo "Export and run ET (C++ runner)"
-          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
+          python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Generate AOTI"
           python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

README.md

Lines changed: 9 additions & 7 deletions
@@ -171,7 +171,7 @@ python3 torchchat.py download llama3.1
 <summary>Additional Model Inventory Management Commands</summary>
 
 ### Where
-This subcommand shows location of a particular model.
+This subcommand shows the location of a particular model.
 ```bash
 python3 torchchat.py where llama3.1
 ```
@@ -216,7 +216,6 @@ This mode generates text based on an input prompt.
 python3 torchchat.py generate llama3.1 --prompt "write me a story about a boy and his bear"
 ```
 
-[skip default]: end
 
 ### Server
 This mode exposes a REST API for interacting with a model.
@@ -286,6 +285,8 @@ First, follow the steps in the Server section above to start a local server. The
 streamlit run torchchat/usages/browser.py
 ```
 
+[skip default]: end
+
 Use the "Max Response Tokens" slider to limit the maximum number of tokens generated by the model for each response. Click the "Reset Chat" button to remove the message history and start a fresh chat.
 
 
@@ -295,6 +296,7 @@ Use the "Max Response Tokens" slider to limit the maximum number of tokens gener
 [AOTI](https://pytorch.org/blog/pytorch2-2/) compiles models before execution for faster inference. The process creates a zipped PT2 file containing all the artifacts generated by AOTInductor, and a [.so](https://en.wikipedia.org/wiki/Shared_library) file with the runnable contents
 that is then loaded for inference. This can be done with both Python and C++ enviroments.
 
+
 The following example exports and executes the Llama3.1 8B Instruct
 model. The first command compiles and performs the actual export.
 
@@ -309,7 +311,7 @@ python3 torchchat.py export llama3.1 --output-aoti-package-path exportedModels/l
 For more details on quantization and what settings to use for your use
 case visit our [customization guide](docs/model_customization.md).
 
-### Run in a Python Enviroment
+### Run in a Python Environment
 
 To run in a python enviroment, use the generate subcommand like before, but include the pt2 file.
 
@@ -376,7 +378,7 @@ While ExecuTorch does not focus on desktop inference, it is capable
 of doing so. This is handy for testing out PTE
 models without sending them to a physical device.
 
-Specifically there are 2 ways of doing so: Pure Python and via a Runner
+Specifically, there are 2 ways of doing so: Pure Python and via a Runner
 
 <details>
 <summary>Deploying via Python</summary>
@@ -476,9 +478,9 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 
 1. Download the AAR file, which contains the Java library and corresponding JNI library, to build and run the app.
 
-- [executorch-240919.aar](https://ossci-android.s3.amazonaws.com/executorch/main/executorch-240919.aar) (SHASUM: c8a5d38ead03bfa28ee8469f6355840ad0d182ba)
+- [executorch.aar](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar) ([sha256sums](https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar.sha256sums))
 
-2. Rename the downloaded AAR file to `executorch.aar` and move the file to `torchchat/edge/android/torchchat/app/libs/`. You may need to create directory `torchchat/edge/android/torchchat/app/libs/` if it does not exist.
+2. Move the downloaded AAR file to `torchchat/edge/android/torchchat/app/libs/`. You may need to create directory `torchchat/edge/android/torchchat/app/libs/` if it does not exist.
 
 3. Push the model and tokenizer file to your device. You can find the model file called `llama3.1.pte` in the current `torchchat` directory and the tokenizer file at `$(python3 torchchat.py where llama3.1)/tokenizer.model` path.
 ```
@@ -500,7 +502,7 @@ The following assumes you've completed the steps for [Setting up ExecuTorch](#se
 and use [this script](https://github.com/pytorch/executorch/blob/main/build/build_android_llm_demo.sh) to build the AAR library.
 
 <p align="center">
-<img src="https://pytorch.org/executorch/main/_static/img/android_llama_app.png" width="600" alt="Android app running a LlaMA model">
+<img src="https://pytorch.org/executorch/main/_static/img/chat.png" width="600" alt="Android app running a LlaMA model">
 </p>
 
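
The README hunks above describe exporting Llama 3.1 to a zipped PT2 package with AOTInductor and then running it from Python. As a rough sketch of what loading that PT2 file for inference can look like outside of torchchat, recent PyTorch nightlies expose `torch._inductor.aoti_load_package`; the path `exportedModels/llama3_1.pt2` and the `(tokens, input_pos)` call signature below are illustrative placeholders, and in practice `python3 torchchat.py generate llama3.1 --aoti-package-path <file>.pt2` handles this loading step for you.

```python
# Rough sketch: loading an AOTInductor PT2 package directly in Python.
# Assumes a PyTorch nightly that provides torch._inductor.aoti_load_package;
# the package path and input signature are illustrative placeholders.
import torch
import torch._inductor  # ensure the submodule is imported

runner = torch._inductor.aoti_load_package("exportedModels/llama3_1.pt2")

tokens = torch.tensor([[128000, 9906]], dtype=torch.int)    # example token ids
input_pos = torch.arange(tokens.shape[1], dtype=torch.int)  # example positions
logits = runner(tokens, input_pos)                          # run the compiled graph
print(logits.shape)
```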

docs/quantization.md

Lines changed: 15 additions & 8 deletions
@@ -121,22 +121,29 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
 ## Experimental TorchAO lowbit kernels
 
 ### Use
-The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
+
+#### linear:a8wxdq
+The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
 It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
 The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
 Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
 
-You should expect high performance on ARM CPU if bitwidth is 1, 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+You should expect high performance on ARM CPU if groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+
+#### embedding:wx
+The quantization scheme embedding:wx quantizes embeddings in a groupwise manner with the specified bitwidth and groupsize. It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize. Unlike linear:a8wxdq, embedding:wx always quantizes with scales and zeros.
+
+You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
 
 ### Setup
-To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
 
 From the torchchat root directory, run
 ```
 sh torchchat/utils/scripts/build_torchao_ops.sh
 ```
 
-This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.
+This should take about 10 seconds to complete.
 
 Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners.
 
@@ -156,17 +163,17 @@ Below we show how to use the new kernels. Except for ExecuTorch, you can specif
 
 #### Eager mode
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
 ```
 
 #### torch.compile
 ```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
 ```
 
 #### AOTI
 ```
-OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3_1.so
+OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so
 OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5
 ```
 
@@ -178,7 +185,7 @@ OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cac
 
 #### ExecuTorch
 ```
-python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3_1.pte
+python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-pte llama3_1.pte
 ```
 
 Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.
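
To make the scheme described in the hunks above concrete: groupwise quantization with `has_weight_zeros: false` stores one scale per `groupsize` consecutive weights and maps each group symmetrically onto the signed range implied by `bitwidth`. The sketch below only illustrates what `bitwidth` and `groupsize` control; the actual torchao lowbit kernels use packed layouts and ARM-specific code, and `quantize_groupwise`/`dequantize_groupwise` are hypothetical helpers.

```python
# Illustrative groupwise, scale-only weight quantization (has_weight_zeros: false).
# Not the torchao kernel implementation; just shows the roles of bitwidth/groupsize.
import torch

def quantize_groupwise(weight: torch.Tensor, bitwidth: int, groupsize: int):
    qmax = 2 ** (bitwidth - 1) - 1                        # e.g. bitwidth=3 -> [-4, 3]
    qmin = -(2 ** (bitwidth - 1))
    w = weight.reshape(-1, groupsize)                     # one scale per group
    scales = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / qmax
    q = torch.clamp(torch.round(w / scales), qmin, qmax).to(torch.int8)
    return q.reshape(weight.shape), scales

def dequantize_groupwise(q: torch.Tensor, scales: torch.Tensor, groupsize: int):
    return (q.reshape(-1, groupsize).float() * scales).reshape(q.shape)

weight = torch.randn(64, 128)
q, scales = quantize_groupwise(weight, bitwidth=3, groupsize=128)
approx = dequantize_groupwise(q, scales, groupsize=128)
print((weight - approx).abs().max())                      # max quantization error
```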

install/.pins/torchao-pin.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-49b1fb61c8b8eceda755579a2fd92c756d822de2
+c8f1174a06dcc0102849c8348ca6573bde8847a9

install/install_requirements.sh

Lines changed: 2 additions & 2 deletions
@@ -54,7 +54,7 @@ PYTORCH_NIGHTLY_VERSION=dev20241002
 VISION_NIGHTLY_VERSION=dev20241002
 
 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20240928
+TUNE_NIGHTLY_VERSION=dev20241010
 
 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -79,7 +79,7 @@ fi
 REQUIREMENTS_TO_INSTALL=(
 torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
 torchvision=="0.20.0.${VISION_NIGHTLY_VERSION}"
-torchtune=="0.3.0.${TUNE_NIGHTLY_VERSION}"
+torchtune=="0.4.0.${TUNE_NIGHTLY_VERSION}"
 )
 
 # Install the requirements. --extra-index-url tells pip to look for package

install/requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,8 @@ tiktoken
 # Miscellaneous
 snakeviz
 sentencepiece
-numpy < 2.0
+# numpy version range required by GGUF util
+numpy >= 1.17, < 2.0
 gguf
 blobfile
 tomli >= 1.1.0 ; python_version < "3.11"

torchchat/cli/builder.py

Lines changed: 12 additions & 15 deletions
@@ -81,19 +81,16 @@ def __post_init__(self):
         if self.aoti_package_path and self.pte_path:
             raise RuntimeError("specify either AOTI Package path or PTE path, but not more than one")
 
-        if self.checkpoint_path and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: checkpoint path ignored because an exported AOTI or PTE path specified"
-            )
-        if self.checkpoint_dir and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: checkpoint dir ignored because an exported AOTI or PTE path specified"
-            )
-        if self.gguf_path and (self.aoti_package_path or self.pte_path):
-            print(
-                "Warning: GGUF path ignored because an exported AOTI or PTE path specified"
-            )
-        if not (self.aoti_package_path) and not (self.pte_path):
+        if self.aoti_package_path or self.pte_path:
+            ignored_params = [
+                (self.checkpoint_path, "checkpoint path"),
+                (self.checkpoint_dir, "checkpoint dir"),
+                (self.gguf_path, "GGUF path"),
+            ]
+            for param, param_msg in ignored_params:
+                if param:
+                    print(f"Warning: {param_msg} ignored because an exported AOTI or PTE path was specified")
+        else:
             self.prefill_possible = True
 
     @classmethod
@@ -453,7 +450,7 @@ def _maybe_init_distributed(
     return world_mesh, parallel_dims
 
 
-def _maybe_parellelize_model(
+def _maybe_parallelize_model(
     model: nn.Module,
     builder_args: BuilderArgs,
     world_mesh: DeviceMesh,
@@ -493,7 +490,7 @@ def _load_model(builder_args: BuilderArgs) -> Model:
         model = _init_model_on_meta_device(builder_args)
     else:
         model = _load_model_default(builder_args)
-    model = _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims)
+    model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()

torchchat/cli/download.py

Lines changed: 11 additions & 4 deletions
@@ -10,7 +10,10 @@
 from pathlib import Path
 from typing import Optional
 
-from torchchat.cli.convert_hf_checkpoint import convert_hf_checkpoint, convert_hf_checkpoint_to_tune
+from torchchat.cli.convert_hf_checkpoint import (
+    convert_hf_checkpoint,
+    convert_hf_checkpoint_to_tune,
+)
 from torchchat.model_config.model_config import (
     load_model_configs,
     ModelConfig,
@@ -57,7 +60,6 @@ def _download_hf_snapshot(
         snapshot_download(
             model_config.distribution_path,
             local_dir=artifact_dir,
-            local_dir_use_symlinks=False,
             token=hf_token,
             ignore_patterns=ignore_patterns,
         )
@@ -77,9 +79,14 @@ def _download_hf_snapshot(
             raise e
 
     # Convert the Multimodal Llama model to the torchtune format.
-    if model_config.name in {"meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-11B-Vision"}:
+    if model_config.name in {
+        "meta-llama/Llama-3.2-11B-Vision-Instruct",
+        "meta-llama/Llama-3.2-11B-Vision",
+    }:
         print(f"Converting {model_config.name} to torchtune format...", file=sys.stderr)
-        convert_hf_checkpoint_to_tune( model_dir=artifact_dir, model_name=model_config.name)
+        convert_hf_checkpoint_to_tune(
+            model_dir=artifact_dir, model_name=model_config.name
+        )
 
     else:
         # Convert the model to the torchchat format.
(file name hidden in this view)

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-/build
+/build
