
Commit f17d2dd

metascroy authored and malfet committed
Fix gguf ci (#199)
* load quantized gguf
* add comments
* remove AOTI
* remove ubuntu
1 parent da28543 commit f17d2dd

File tree

5 files changed: +85 -80 lines changed

.github/workflows/compile-gguf.yml

Lines changed: 24 additions & 18 deletions
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-14]
+        runner: [macos-14]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout repo
@@ -40,41 +40,47 @@ jobs:
           wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
           wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
       - name: Run inference
-        run: |
+        run: |
           export GGUF_PATH=gguf_files/llama-2-7b.Q4_0.gguf
           export TOKENIZER_PATH=gguf_files/tokenizer.model
           export MODEL_NAME=llama-2-7b.Q4_0.gguf
           export MODEL_DIR=/tmp
-          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+
+          echo "******************************************"
+          echo "******* Embed: not quantized *************"
+          echo "******************************************"
+
+          echo "Running eager"
+          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+
+          echo "Running compiled"
+          python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
-          cat ./output_aoti

           echo "******************************************"
           echo "******* Emb: channel-wise quantized ******"
           echo "******************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+
+          echo "Running eager"
+          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+
+          echo "Running compiled"
+          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
-          cat ./output_aoti

           echo "******************************************"
           echo "******** Emb: group-wise quantized *******"
           echo "******************************************"
-          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_eager
+
+          echo "Running eager"
+          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
           cat ./output_eager
-          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 > ./output_compiled
+
+          echo "Running compiled"
+          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
-          cat ./output_aoti

           echo "tests complete"
           echo "******************************************"
-
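The two --quant configurations above differ only in groupsize: 0 gives each embedding row a single 8-bit scale (channel-wise), while 8 gives one scale to every group of 8 consecutive values. A minimal sketch of that distinction, assuming symmetric int8 quantization; the project's actual scheme lives in quantize.py and may differ in detail:

import torch

def quantize_embedding_int8(weight: torch.Tensor, groupsize: int):
    # weight: (vocab_size, embed_dim); groupsize 0 means one group per row
    gs = weight.shape[1] if groupsize == 0 else groupsize
    grouped = weight.reshape(weight.shape[0], -1, gs)
    scales = grouped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(grouped / scales), -128, 127).to(torch.int8)
    return q.reshape(weight.shape), scales.squeeze(-1)

w = torch.randn(32000, 4096)
q_ch, s_ch = quantize_embedding_int8(w, groupsize=0)  # s_ch: (32000, 1)
q_gr, s_gr = quantize_embedding_int8(w, groupsize=8)  # s_gr: (32000, 512)

Smaller groups track local weight ranges more closely, trading extra scale storage for lower quantization error.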

build/builder.py

Lines changed: 25 additions & 14 deletions
@@ -47,7 +47,7 @@ def __post_init__(self):
             (self.pte_path and Path(self.pte_path).is_file())
         ):
             raise RuntimeError("need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path")
-
+
         if (self.dso_path and self.pte_path):
             raise RuntimeError("specify either DSO path or PTE path, but not both")
@@ -58,7 +58,7 @@ def __post_init__(self):
         if (self.gguf_path and (self.dso_path or self.pte_path)):
             print("Warning: GGUF path ignored because an exported DSO or PTE path specified")

-
+
     @classmethod
     def from_args(cls, args): # -> BuilderArgs:
         return cls(
@@ -79,14 +79,14 @@ def from_args(cls, args): # -> BuilderArgs:
     def from_speculative_args(cls, args): # -> BuilderArgs:
         speculative_builder_args = BuilderArgs.from_args(args)
         # let's limit multi-checkpoint to checker
-        speculative_builder_args.checkpoint_dir = None
+        speculative_builder_args.checkpoint_dir = None
         speculative_builder_args.checkpoint_path = args.draft_checkpoint_path
         speculative_builder_args.gguf_path = None
         speculative_builder_args.dso_path = None
         speculative_builder_args.pte_path = None
         return speculative_builder_args

-
+
 @dataclass
 class TokenizerArgs:
     tokenizer_path: Optional[Union[Path, str]] = None
@@ -97,7 +97,7 @@ class TokenizerArgs:
     def from_args(cls, args): # -> TokenizerArgs:
         is_SentencePiece = True
         is_TikToken = False
-
+
         if args.tokenizer_path:
             tokenizer_path = args.tokenizer_path
         elif args.checkpoint_path:
@@ -106,7 +106,7 @@ def from_args(cls, args): # -> TokenizerArgs:
             tokenizer_path = args.checkpoint_dir / "tokenizer.model"
         else:
             raise RuntimeError(f"cannot find tokenizer model")
-
+
         if not tokenizer_path.is_file():
             raise RuntimeError(f"did not find tokenizer at {tokenizer_path}")

@@ -127,7 +127,7 @@ def _initialize_tokenizer(tokenizer_args: TokenizerArgs):
         raise RuntimeError("TikToken not implemented yet!")
     else:
         raise RuntimeError("must specify a valid tokenizer in TokenizerArgs")
-
+

 def device_sync(device):
     if "cuda" in device:
@@ -147,17 +147,30 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))

-def _load_model(
+def _load_model(builder_args):
+    if builder_args.gguf_path:
+        model = Transformer.from_gguf(builder_args.gguf_path)
+
+        # TODO: to take advantage of mmap, maybe we write converted gguf to file
+        # and read back in?
+        # TODO: should we add check that builder_args.precision is aligned with quant scheme, e.g., bfloat16
+        # is needed for int4
+        model = model.to(device=builder_args.device, dtype=builder_args.precision)
+        return model.eval()
+    else:
+        return _load_model_not_gguf(builder_args)
+
+def _load_model_not_gguf(
     builder_args
 ):
+    assert not builder_args.gguf_path
+
     use_cuda = "cuda" in builder_args.device
     with torch.device("meta"):
         if builder_args.params_path:
             model = Transformer.from_params(builder_args.params_path)
         elif builder_args.params_table:
             model = Transformer.from_table(builder_args.params_path)
-        elif builder_args.gguf_path:
-            model = Transformer.from_gguf(builder_args.gguf_path)
         else:
             model = Transformer.from_name(builder_args.checkpoint_path.parent.name)

@@ -176,7 +189,7 @@ def _load_model(
                     mmap=True,
                 )
             )
-
+
     checkpoint = {}
     for key in cps[0].keys():
         if not torch.allclose(cps[0][key], cps[1][key]):
@@ -210,7 +223,7 @@ def _initialize_model(
     quantize,
 ):
     print("Loading model ...")
-    t0 = time.time()
+    t0 = time.time()
     model_ = _load_model(
         builder_args
     )
@@ -261,5 +274,3 @@ def _initialize_model(
     model.to(dtype=builder_args.precision)

     return model
-
-
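With this change _load_model becomes a thin dispatcher: a GGUF checkpoint is materialized through Transformer.from_gguf and moved to the requested device and dtype, while every other source falls through to _load_model_not_gguf. A hedged sketch of driving the GGUF path directly; gguf_path, device, and precision appear in the diff, but the exact set of BuilderArgs constructor arguments is an assumption:

import torch
from build.builder import BuilderArgs, _load_model

# gguf_path must point at an existing file or __post_init__ raises; this
# reuses the Q4_0 checkpoint the CI job downloads.
builder_args = BuilderArgs(
    gguf_path="gguf_files/llama-2-7b.Q4_0.gguf",
    device="cpu",
    precision=torch.float32,
)
model = _load_model(builder_args)  # from_gguf -> .to(device, dtype) -> .eval()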

build/gguf_loader.py

Lines changed: 1 addition & 3 deletions
@@ -13,9 +13,7 @@
 from pathlib import Path
 from typing import Any, Mapping, Dict
 import logging
-from quantize import (
-    WeightOnlyInt4Linear, pack_scales_and_zeros, group_dequantize_tensor_from_qparams
-)
+from quantize import WeightOnlyInt4Linear, pack_scales_and_zeros, group_dequantize_tensor_from_qparams
 from build.gguf_util import F16, F32, Q4_0, Q6_K
 import gguf
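For reference, the Q4_0 format imported here via build.gguf_util stores weights in blocks of 32: one fp16 scale d followed by 16 bytes of packed 4-bit values, dequantized as w = d * (q - 8). A rough sketch of one block; build/gguf_util.Q4_0 is the real implementation, and the helper below is illustrative only:

import torch

def dequantize_q4_0_block(d: float, packed: torch.Tensor) -> torch.Tensor:
    # packed: 16 uint8 bytes holding 32 4-bit quants; ggml stores the first
    # 16 values in the low nibbles and the next 16 in the high nibbles
    lo = packed & 0x0F
    hi = packed >> 4
    q = torch.cat([lo, hi]).to(torch.float32)
    return d * (q - 8.0)

block = dequantize_q4_0_block(0.02, torch.randint(0, 256, (16,), dtype=torch.uint8))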

generate.py

Lines changed: 9 additions & 9 deletions
@@ -47,7 +47,7 @@ def from_args(cls, args): # -> GeneratorArgs:
         speculate_k = args.speculate_k,
     )

-
+
 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
@@ -305,7 +305,7 @@ def encode_tokens(tokenizer, string, bos=True, device="cuda"):
 def _main(
     builder_args: BuilderArgs,
     speculative_builder_args: BuilderArgs,
-    tokenizer_args: TokenizerArgs,
+    tokenizer_args: TokenizerArgs,
     prompt: str = "Hello, my name is",
     chat_mode: bool = False,
     num_samples: int = 5,
@@ -332,25 +332,25 @@ def _main(
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
     is_speculative = speculative_builder_args.checkpoint_path is not None
-
+
     is_chat = "chat" in str(builder_args.checkpoint_path)
     if is_chat:
         raise RuntimeError("need to stop filename based kludgery, at a minimum need to look at all pathnames. yuck!")
-
+
     tokenizer = _initialize_tokenizer(tokenizer_args)
-
+
     builder_args.setup_caches = False
     model = _initialize_model(
         builder_args,
         quantize
     )
-
+
     # will add a version of _initialize_model in future
     # (need additional args)
     if is_speculative:
         from builder import _load_model
         speculative_builder_args = builder_args
-
+
         draft_model = _load_model(
             speculative_builder_args,
         )
@@ -478,13 +478,13 @@ def callback(x):
     )
     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")

-
+
 def main(args):
     builder_args = BuilderArgs.from_args(args)
     speculative_builder_args = BuilderArgs.from_speculative_args(args)
     tokenizer_args = TokenizerArgs.from_args(args)
     generator_args = GeneratorArgs.from_args(args)
-
+
     _main(
         builder_args,
         speculative_builder_args,