
Commit 1ff8d80

mikekgfb authored and malfet committed

fixed device support for int4, and unit tests (#212)

* fixed device support for int4, and unit tests
* typo
* typo
* bfloat16 on macos
* bfloat16 in mps, redux
* typo
* remove extraneous use_cuda
* device setting in gguf loader

1 parent fa384cb · commit 1ff8d80

File tree

7 files changed: +209 -42 lines changed
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
+name: Run compile tests
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  test-cuda:
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.1"
+      script: |
+        echo "::group::Print machine info"
+        uname -a
+        if [ $(uname -s) == Darwin ]; then
+          sysctl machdep.cpu.brand_string
+          sysctl machdep.cpu.core_count
+        fi
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        # Install requirements
+        pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+        pip install -r requirements.txt
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_NAME=stories15M
+        export MODEL_DIR=/tmp
+
+        for DTYPE in bfloat16 float16 float32; do
+
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******* Emb: channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** Emb: group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******* INT8 channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** INT8 group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** INT4 group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --dtype ${DTYPE} --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --dtype ${DTYPE} --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --dtype ${DTYPE} --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --dtype ${DTYPE} --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"
+
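Aside on the flag this workflow exercises: each --quant argument is a JSON object keyed by quantization scheme, and the channel-wise vs. group-wise wording in the banners corresponds to the groupsize option (0 means per-channel; a positive value sets the group width). A minimal decoding sketch using only the JSON shapes shown above — illustrative, not the repo's actual dispatch code:

    import json

    # Illustrative only: decode a --quant argument like the ones above.
    quant_arg = '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}'
    for scheme, options in json.loads(quant_arg).items():
        groupsize = options.get("groupsize", 0)
        mode = "channel-wise" if groupsize == 0 else f"group-wise ({groupsize} per group)"
        print(f"{scheme}: {mode}")  # linear:int8: group-wise (8 per group)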

.github/workflows/compile_t4.yml

Lines changed: 12 additions & 7 deletions
@@ -93,13 +93,18 @@ jobs:
         python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
         cat ./output_aoti

+        echo "******************************************"
+        echo "******** INT4 group-wise quantized *******"
+        echo "******************************************"
+        python generate.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
+        python generate.py --device cuda --compile --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+        cat ./output_compiled
+        python export.py --device cuda --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+        python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+        cat ./output_aoti
+
         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
-        # echo "********* EAGER vs TORCH.COMPILE *********"
-        # echo "******************************************"
-        # diff output_eager output_compiled
-        # echo "******************************************"
-        # echo "********* EAGER vs AOT INDUCTOR *********"
-        # echo "******************************************"
-        # diff output_eager output_aoti
+
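Each quantization leg above runs the same model three ways: eager, torch.compile (--compile), and an AOT Inductor shared object built by export.py and loaded back via --dso-path; the outputs are printed so any divergence is visible in the logs. A generic sketch of the eager-vs-compiled half of that comparison — plain PyTorch, where the repo wraps the same idea around a full transformer:

    import torch
    import torch.nn as nn

    # Generic sketch: compare eager output with torch.compile output,
    # as the workflow does via output_eager vs. output_compiled.
    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU())
    x = torch.randn(1, 16)

    eager_out = model(x)
    compiled_model = torch.compile(model)  # the --compile path
    compiled_out = compiled_model(x)
    print(torch.allclose(eager_out, compiled_out, atol=1e-5))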

.github/workflows/test_mps-dtype.yml

Lines changed: 15 additions & 10 deletions
@@ -52,14 +52,19 @@ jobs:

        python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
-        # python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
-        # python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
-        # python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
-        # python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
-        # PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
+
+        python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
+
+        python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
+
+        python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
+
+        python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
+
+        PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager
        done
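For context, PYTORCH_ENABLE_MPS_FALLBACK=1 in the final run is PyTorch's standard escape hatch on Apple silicon: ops without Metal kernels (here, the int4 packed-weight path) fall back to CPU instead of raising an error, and the workflow sets it in the shell before launching Python. A generic sketch of the device selection these jobs assume — plain PyTorch, not repo code:

    import torch

    # Generic sketch: use MPS when present, as the --device mps runs assume.
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    x = torch.randn(4, 4, device=device, dtype=torch.float16)
    print((x @ x).device)  # mps on Apple silicon, cpu elsewhere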

.github/workflows/test_mps.yml

Lines changed: 2 additions & 2 deletions
@@ -71,6 +71,6 @@ jobs:
        echo "*** linear int4"
        echo "************************************************************"

-        # PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
-        # cat ./output_eager
+        PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+        cat ./output_eager

build/builder.py

Lines changed: 0 additions & 1 deletion
@@ -165,7 +165,6 @@ def _load_model_not_gguf(
 ):
     assert not builder_args.gguf_path

-    use_cuda = "cuda" in builder_args.device
     with torch.device("meta"):
         if builder_args.params_path:
             model = Transformer.from_params(builder_args.params_path)
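The deleted use_cuda local was extraneous (per the commit message): device selection travels through builder_args, while the surrounding context keeps building the model on the meta device first. A generic sketch of that meta-then-materialize pattern — standard PyTorch, with nn.Linear standing in for the repo's Transformer:

    import torch
    import torch.nn as nn

    # Generic sketch of the pattern in the context lines above: build the
    # module without allocating storage, then materialize real tensors.
    with torch.device("meta"):
        model = nn.Linear(256, 256, bias=False)  # stand-in for Transformer.from_params(...)

    state_dict = {"weight": torch.randn(256, 256)}  # stand-in for the checkpoint
    model.load_state_dict(state_dict, assign=True)  # assign=True replaces the meta tensors
    print(model.weight.device)  # cpu; a later .to(device) moves it as needed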

build/gguf_loader.py

Lines changed: 3 additions & 2 deletions
@@ -177,11 +177,12 @@ def load_weights(pt_model: torch.nn.Module, weight_map: Dict[str, ReaderTensor],
                 parent,
                 _fqn_last(fqn),
                 WeightOnlyInt4Linear(
-                    in_features, out_features,
+                    "cpu",  # TODO: should --device work for gguf load? (yes?!)
+                    in_features,
+                    out_features,
                     bias=False,
                     groupsize=Q4_0.groupsize,
                     inner_k_tiles=inner_k_tiles,
-                    use_cuda=False
                 )
             )
         else: