Commit 68b53f3

mikekgfb authored and malfet committed

runner-aoti on cuda (#531)

* runner-aoti on cuda
* transfer results back to CPU
1 parent 3fd0947 commit 68b53f3

File tree

2 files changed: +101 −1 lines changed
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
name: Run runner-aoti CUDA tests

on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  test-cuda:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"

        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install the CUDA nightly wheels to match the CUDA runner
        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
        pip3 install -r requirements.txt
        pip3 list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        mkdir -p checkpoints/stories15M
        pushd checkpoints/stories15M
        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
        # The native runner expects the tokenizer in binary form
        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
        popd
        echo "::endgroup::"

        echo "::group::Build native runner"
        bash scripts/build_native.sh aoti
        echo "::endgroup::"

        echo "::group::Run inference"
        set -eou pipefail

        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=${PWD}/checkpoints/stories15M
        export PROMPT="Once upon a time in a land far away"

        for DTYPE in bfloat16; do
          # Eager generation on the GPU
          python torchchat.py generate --dtype ${DTYPE} --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" --device cuda

          # Export with AOT Inductor, then run the exported model through the native runner
          python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so

          ./cmake-out/aoti_run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}"

          echo "**********************************************"
          echo "******** INT4 HQQ group-wise quantized *******"
          echo "**********************************************"
          python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
          cat ./output_compiled
          python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:hqq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
          cat ./output_aoti
        done

        echo "tests complete"
        echo "******************************************"
        echo "::endgroup::"

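For context, the `./cmake-out/aoti_run /tmp/model.so ...` invocation above loads the shared object produced by `torchchat.py export` through AOT Inductor's model-container runner, the same API that `runner/run.cpp` uses below. Here is a minimal sketch of that flow, assuming a libtorch build where `torch::inductor::AOTIModelContainerRunnerCpu` is available (its signature varies across PyTorch versions) and using illustrative input shapes; it is not torchchat's actual runner:

    #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
    #include <torch/torch.h>
    #include <iostream>
    #include <vector>

    int main() {
      // Load the ahead-of-time compiled model produced by `torchchat.py export`.
      // The path and the batch-1 token/position shapes are assumptions.
      torch::inductor::AOTIModelContainerRunnerCpu runner("/tmp/model.so");

      torch::Tensor token_tensor = torch::tensor({{1}}, torch::kLong); // BOS token
      torch::Tensor pos_tensor = torch::tensor({0}, torch::kLong);     // position 0
      std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};

      // One forward step; the first output holds the next-token logits.
      torch::Tensor logits = runner.run(inputs)[0].to(torch::kFloat32);
      std::cout << "next token: " << logits.flatten().argmax().item<int64_t>() << "\n";
      return 0;
    }

The CUDA path exercised by this commit differs mainly in where the model executes; the output tensor then lives in GPU memory, which is exactly what the `runner/run.cpp` change below addresses.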
runner/run.cpp

Lines changed: 5 additions & 1 deletion
@@ -23,6 +23,8 @@
 #ifdef __AOTI_MODEL__
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
+torch::Device cpu_device(torch::kCPU);
+
 #else // __ET_MODEL__
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
@@ -167,7 +169,9 @@ float* forward(Transformer* transformer, int token, int pos) {
 std::vector<torch::Tensor> inputs{token_tensor, pos_tensor};

 torch::Tensor result =
-    transformer->runner->run(inputs)[0].to(torch::dtype(torch::kFloat32));
+    transformer->runner->run(inputs)[0]
+        .to(torch::dtype(torch::kFloat32))
+        .to(cpu_device);
 auto logits = result[0].data_ptr();

 #else // __ET_MODEL__

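The second hunk is the heart of the fix: when the AOTI model runs on CUDA, `run` returns a tensor in device memory, and reading it through `data_ptr()` from host code is invalid until the tensor is copied back. A self-contained sketch of the pattern, with `torch::randn` standing in for the runner output and a 32000-element logits shape assumed purely for illustration:

    #include <torch/torch.h>
    #include <iostream>

    int main() {
      // Run on CUDA when available, as the new CI workflow does; fall back to CPU.
      torch::Device device = torch::cuda::is_available() ? torch::Device(torch::kCUDA)
                                                         : torch::Device(torch::kCPU);
      torch::Device cpu_device(torch::kCPU);

      // Stand-in for the runner output in forward(): logits living on `device`.
      torch::Tensor result = torch::randn({1, 32000}, device)
                                 .to(torch::dtype(torch::kFloat32))
                                 .to(cpu_device); // device-to-host copy; no-op on CPU

      // Only after the copy is data_ptr() a valid host pointer.
      float* logits = result[0].data_ptr<float>();
      std::cout << "first logit: " << logits[0] << "\n";
      return 0;
    }

Because `.to()` returns the input tensor unchanged when the dtype and device already match, the added `.to(cpu_device)` costs nothing in the CPU-only build, which is why a single code path can serve both configurations.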