Commit 74584d9

Update packaging in AOTI path (#896)
* Update AOTI package
* Fix test-runner-aot-cuda

Co-authored-by: Mengwei Liu <[email protected]>
1 parent f20f5e7 commit 74584d9

File tree

11 files changed: +188 -92 lines


.ci/scripts/validate.sh

Lines changed: 18 additions & 18 deletions
@@ -133,60 +133,60 @@ function generate_aoti_model_output() {
 echo "******************************************"
 echo "************** non-quantized *************"
 echo "******************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path "${MODEL_DIR}/${MODEL_NAME}.so" --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --dso-path "$MODEL_DIR/${MODEL_NAME}.so" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path "${MODEL_DIR}/${MODEL_NAME}.pt2" --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --aoti-package-path "$MODEL_DIR/${MODEL_NAME}.pt2" --prompt "$PROMPT" --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 echo "******************************************"
 echo "******* Emb: channel-wise quantized ******"
 echo "******************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 echo "******************************************"
 echo "******** Emb: group-wise quantized *******"
 echo "******************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 echo "***********************************************"
 echo "******* Emb: 4bit channel-wise quantized ******"
 echo "***********************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 0, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 echo "***********************************************"
 echo "******** Emb: 4bit group-wise quantized *******"
 echo "***********************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 4, "groupsize": 8, "packed": "True"}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 if [ "${EXCLUDE_INT8_QUANT:-false}" == false ]; then
 echo "******************************************"
 echo "******* INT8 channel-wise quantized ******"
 echo "******************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"

 echo "******************************************"
 echo "******** INT8 group-wise quantized *******"
 echo "******************************************"
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"
 fi
 echo "******************************************"
 echo "******** INT4 group-wise quantized *******"
 echo "******************************************"
 if [[ "$TARGET_DEVICE" != "cuda" || "$DTYPE" == "bfloat16" ]]; then
 # For CUDA, only bfloat16 makes sense for int4 mm kernel
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py generate --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --temperature 0 --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" > "$MODEL_DIR/output_aoti" || exit 1
 .ci/scripts/check_gibberish "$MODEL_DIR/output_aoti"
 fi
 done
@@ -285,8 +285,8 @@ function eval_model_sanity_check() {
 echo "******** INT4 group-wise quantized (AOTI) *******"
 echo "*************************************************"
 if [ "$DTYPE" != "float16" ]; then
-python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so --dynamic-shapes --device "$TARGET_DEVICE" || exit 1
-python3 -W ignore torchchat.py eval --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --dso-path ${MODEL_DIR}/${MODEL_NAME}.so --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
+python3 -W ignore torchchat.py export --dtype ${DTYPE} --quant "$QUANT_OPTIONS" --checkpoint-path "$CHECKPOINT_PATH" --output-aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --dynamic-shapes --device "$TARGET_DEVICE" || exit 1
+python3 -W ignore torchchat.py eval --dtype ${DTYPE} --checkpoint-path "$CHECKPOINT_PATH" --aoti-package-path ${MODEL_DIR}/${MODEL_NAME}.pt2 --device "$TARGET_DEVICE" --limit 5 > "$MODEL_DIR/output_eval_aoti" || exit 1
 cat "$MODEL_DIR/output_eval_aoti"
 fi;
 fi;

.github/workflows/pull.yml

Lines changed: 7 additions & 7 deletions
@@ -378,8 +378,8 @@ jobs:

 echo "::group::Run inference with quantize file"
 if [ $(uname -s) == Darwin ]; then
-python3 torchchat.py export --output-dso-path /tmp/model.so --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
-python3 torchchat.py generate --dso-path /tmp/model.so --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
+python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
 fi
 echo "::endgroup::"

@@ -1023,8 +1023,8 @@ jobs:

 for dtype in fp32 fp16 bf16 fast fast16; do
 echo "Running export + runner with dtype=$dtype"
-python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --dtype $dtype --output-dso-path /tmp/model.so
-./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
+python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --dtype $dtype --output-aoti-package-path /tmp/model.pt2
+./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
 done

 echo "Tests complete."
@@ -1118,8 +1118,8 @@ jobs:
 python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
 ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
 echo "Export and run AOTI (C++ runner)"
-python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
-./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
+python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
+./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
 echo "Generate AOTI"
-python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"
+python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
 echo "Tests complete."

.github/workflows/runner-cuda-dtype.yml

Lines changed: 2 additions & 2 deletions
@@ -56,9 +56,9 @@ jobs:
 for DTYPE in bfloat16; do
 python torchchat.py generate --dtype ${DTYPE} --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" --device cuda

-python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so
+python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2

-./cmake-out/aoti_run /tmp/model.so -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"
+./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}"

 done

README.md

Lines changed: 14 additions & 11 deletions
@@ -293,13 +293,18 @@ Use the "Max Response Tokens" slider to limit the maximum number of tokens gener
 ## Desktop/Server Execution

 ### AOTI (AOT Inductor)
-[AOTI](https://pytorch.org/blog/pytorch2-2/) compiles models before execution for faster inference. The process creates a [DSO](https://en.wikipedia.org/wiki/Shared_library) model (represented by a file with extension `.so`)
-that is then loaded for inference. This can be done with both Python and C++ environments.
+[AOTI](https://pytorch.org/blog/pytorch2-2/) compiles models before execution
+for faster inference. The process creates a zipped PT2 file containing all the
+artifacts generated by AOTInductor, and a
+[.so](https://en.wikipedia.org/wiki/Shared_library) file with the runnable
+contents that is then loaded for inference. This can be done with both Python
+and C++ enviroments.

 The following example exports and executes the Llama3.1 8B Instruct
 model. The first command compiles and performs the actual export.
-```
-python3 torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.so
+
+```bash
+python3 torchchat.py export llama3.1 --output-aoti-package-path exportedModels/llama3_1_artifacts.pt2
 ```

 > [!NOTE]
@@ -311,12 +316,11 @@ case visit our [customization guide](docs/model_customization.md).

 ### Run in a Python Environment

-To run in a python environment, use the generate subcommand like before, but include the dso file.
+To run in a python enviroment, use the generate subcommand like before, but include the pt2 file.

+```bash
+python3 torchchat.py generate llama3.1 --aoti-package-path exportedModels/llama3_1_artifacts.pt2 --prompt "Hello my name is"
 ```
-python3 torchchat.py generate llama3.1 --dso-path exportedModels/llama3.1.so --prompt "Hello my name is"
-```
-**Note:** Depending on which accelerator is used to generate the .dso file, the command may need the device specified: `--device (cuda | cpu)`.


 ### Run using our C++ Runner
@@ -326,11 +330,10 @@ To run in a C++ enviroment, we need to build the runner binary.
 torchchat/utils/scripts/build_native.sh aoti
 ```

-Then run the compiled executable, with the exported DSO from earlier.
+Then run the compiled executable, with the pt2.
 ```bash
-cmake-out/aoti_run exportedModels/llama3.1.so -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
 ```
-**Note:** Depending on which accelerator is used to generate the .dso file, the runner may need the device specified: `-d (CUDA | CPU)`.

 ## Mobile Execution

runner/run.cpp

Lines changed: 4 additions & 15 deletions
@@ -31,10 +31,7 @@ LICENSE file in the root directory of this source tree.
 #endif

 #ifdef __AOTI_MODEL__
-#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
-#ifdef USE_CUDA
-#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
-#endif
+#include <torch/csrc/inductor/aoti_package/model_package_loader.h>
 torch::Device aoti_device(torch::kCPU);

 #else // __ET_MODEL__
@@ -94,7 +91,7 @@ typedef struct {
 RunState state; // buffers for the "wave" of activations in the forward pass

 #ifdef __AOTI_MODEL__
-torch::inductor::AOTIModelContainerRunner* runner;
+torch::inductor::AOTIModelPackageLoader* runner;
 #else // __ET_MODEL__
 Module* runner;
 #endif
@@ -144,16 +141,8 @@ void build_transformer(
 malloc_run_state(&t->state, &t->config);

 #ifdef __AOTI_MODEL__
-#ifdef USE_CUDA
-if (aoti_device.type() == torch::kCUDA) {
-t->runner = new torch::inductor::AOTIModelContainerRunnerCuda(model_path);
-aoti_device = torch::Device(torch::kCUDA);
-} else {
-#else
-{
-#endif
-t->runner = new torch::inductor::AOTIModelContainerRunnerCpu(model_path);
-}
+t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
+aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA);
 #else //__ET_MODEL__
 t->runner = new Module(
 /* path to PTE model */ model_path,
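
For orientation, the new loader API that replaces the per-device container runners above can also be exercised on its own. The following is a minimal standalone sketch, not part of this commit: it assumes a LibTorch build with AOTInductor packaging support, and the model path, dummy input shape, and run() call are illustrative placeholders rather than the runner's real tokenizer-driven inputs.

// Illustrative sketch (not from this commit): load an AOTInductor .pt2 package
// and pick the device from its metadata, mirroring build_transformer() above.
#include <torch/csrc/inductor/aoti_package/model_package_loader.h>
#include <torch/torch.h>

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothetical path; produce one with `torchchat.py export --output-aoti-package-path /tmp/model.pt2`.
  const std::string model_path = "/tmp/model.pt2";

  // A single loader replaces the separate CPU/CUDA AOTIModelContainerRunner classes.
  torch::inductor::AOTIModelPackageLoader loader(model_path);

  // The export device is recorded in the package metadata under AOTI_DEVICE_KEY.
  const bool on_cpu = loader.get_metadata()["AOTI_DEVICE_KEY"] == "cpu";
  const torch::Device device = on_cpu ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA);

  // Placeholder input: a single token id; the real runner feeds tokenizer output and positions.
  std::vector<torch::Tensor> inputs = {
      torch::zeros({1, 1}, torch::dtype(torch::kLong).device(device))};
  std::vector<torch::Tensor> outputs = loader.run(inputs);

  std::cout << "first output shape: " << outputs[0].sizes() << std::endl;
  return 0;
}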
