Skip to content

Commit 8afc1ef

Browse files
committed
First pass at converting GGMLv3 LLaMA models to GGUF
1 parent c818c40 commit 8afc1ef

File tree

2 files changed

+263
-12
lines changed

2 files changed

+263
-12
lines changed

convert-llama-ggmlv3-to-gguf.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
import sys, struct, math, argparse
2+
3+
import numpy as np
4+
5+
import gguf
6+
7+
# Note: Does not support GGML_QKK_64
# Number of elements in one k-quant super-block.
QK_K = 256

# Maps each GGML tensor type to (block size, type size):
#   block size - number of elements stored per quantization block
#   type size  - number of bytes one block occupies in the file
# A tensor with n elements therefore occupies n * type_size // block_size bytes.
GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.F32:  (1, 4),
    gguf.GGMLQuantizationType.F16:  (1, 2),
    gguf.GGMLQuantizationType.Q4_0: (32, 2 + 16),
    gguf.GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
    gguf.GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q8_0: (32, 2 + 32),
    gguf.GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
    gguf.GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    gguf.GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
    gguf.GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    # Q8_K uses a 4-byte float scale (not a 2-byte half like the other
    # k-quants), followed by QK_K int8 values and QK_K // 16 int16 block sums.
    gguf.GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
}
26+
27+
class Hyperparameters:
    """Fixed-size hyperparameter header of a GGMLv3 file.

    The header is read as seven consecutive little-endian unsigned 32-bit
    integers; every field starts out as 0 until load() is called.
    """

    # Attribute names, in the order the values appear in the file.
    _FIELDS = ('n_vocab', 'n_embd', 'n_mult', 'n_head', 'n_layer', 'n_rot', 'ftype')

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, 0)

    def load(self, data, offset):
        """Fill all fields from ``data`` starting at ``offset``.

        Returns the number of bytes consumed (always 28).
        """
        size = 4 * 7
        values = struct.unpack('<7I', data[offset:offset + size])
        for field, value in zip(self._FIELDS, values):
            setattr(self, field, value)
        return size

    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, ftype={self.ftype}>'
45+
46+
class Vocab:
    """Vocabulary table of a GGMLv3 file, kept as ``(token_bytes, score)`` pairs."""

    def __init__(self):
        self.items = []

    def load(self, data, offset, n_vocab):
        """Parse ``n_vocab`` entries starting at ``offset`` and append them to ``items``.

        Each entry is a little-endian uint32 length, that many raw bytes of
        token text, then a little-endian float32 score.

        Returns the number of bytes consumed.
        """
        cursor = offset
        for _ in range(n_vocab):
            (text_len,) = struct.unpack('<I', data[cursor:cursor + 4])
            # Sanity limit so a corrupt length cannot swallow the rest of the file.
            assert text_len < 4096, 'Absurd vocab item length'
            text_start = cursor + 4
            score_start = text_start + text_len
            text = bytes(data[text_start:score_start])
            (score,) = struct.unpack('<f', data[score_start:score_start + 4])
            self.items.append((text, score))
            cursor = score_start + 4
        return cursor - offset
62+
63+
class Tensor:
    """Metadata for one tensor record in a GGMLv3 file.

    load() records where the tensor payload lives inside the input buffer;
    the payload itself is not copied.
    """

    def __init__(self):
        self.name = None        # raw name bytes as read from the record
        self.dims = ()          # dimensions in the order they are stored
        self.dtype = None       # numeric tensor type id read from the record
        self.start_offset = 0   # offset of the payload within the input buffer
        self.len_bytes = 0      # payload length in bytes

    def load(self, data, offset):
        """Parse the tensor record that starts at ``offset`` in ``data``.

        The record is: three little-endian uint32 values (number of
        dimensions, name length, type id), the dimensions as uint32 values,
        the name bytes, padding up to the next multiple of 32 bytes, then the
        payload.

        Returns the total number of bytes the record occupies (header,
        padding and payload).

        Raises AssertionError if the dimension count, name length or tensor
        type is not acceptable.
        """
        orig_offset = offset
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        # n_dims is unsigned, so only the upper bound needs checking.
        assert n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
        self.dtype = dtype
        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
        offset += 4 * n_dims
        self.name = bytes(data[offset:offset + name_len])
        offset += name_len
        # The payload starts at the next 32-byte boundary of the buffer.
        offset += ((offset + 31) & ~31) - offset
        # Multiply with Python ints: exact for any size, and an int (1) even
        # when there are no dimensions. np.prod would yield a float for an
        # empty shape and may use a 32-bit integer on some platforms.
        n_elems = 1
        for dim in self.dims:
            n_elems *= dim
        n_bytes = (n_elems * tysize) // blksize
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
        return offset - orig_offset
94+
95+
class GGMLV3Model:
    """Parsed view of a GGJT v3 model file: hyperparameters, vocab and tensor index."""

    def __init__(self):
        self.hyperparameters = None
        self.vocab = None
        self.tensor_map = {}
        self.tensors = []

    def validate_header(self, data, offset):
        """Check the 4-byte magic and uint32 version at ``offset``.

        Returns the header size in bytes (8). Raises ValueError when the
        magic is not ``b'tjgg'`` or the version is not 3.
        """
        magic = bytes(data[offset:offset + 4])
        if magic != b'tjgg':
            raise ValueError('Only GGJTv3 supported')
        (version,) = struct.unpack('<I', data[offset + 4:offset + 8])
        if version != 3:
            raise ValueError('Only GGJTv3 supported')
        return 8

    def load(self, data, offset):
        """Parse the whole model from ``data`` beginning at ``offset``.

        Reads the header, the hyperparameters, the vocabulary, and then
        tensor records until the end of ``data``. The instance attributes are
        only assigned once everything has been parsed.

        Returns the offset just past the last tensor record.
        """
        pos = offset + self.validate_header(data, offset)
        hyperparameters = Hyperparameters()
        pos += hyperparameters.load(data, pos)
        vocabulary = Vocab()
        pos += vocabulary.load(data, pos, hyperparameters.n_vocab)
        tensor_list = []
        index_by_name = {}
        end = len(data)
        while pos < end:
            entry = Tensor()
            pos += entry.load(data, pos)
            # Remember each tensor's position in the list, keyed by its name bytes.
            index_by_name[entry.name] = len(tensor_list)
            tensor_list.append(entry)
        self.hyperparameters = hyperparameters
        self.vocab = vocabulary
        self.tensors = tensor_list
        self.tensor_map = index_by_name
        return pos
125+
126+
def _convert_vocab_item(tokid, vbytes):
    """Return ``(token_bytes, token_type)`` for one GGML vocab entry."""
    if len(vbytes) > 1 and vbytes[0] == 32:
        # Token starts with a space: write spaces as U+2581 (UTF-8 encoded).
        vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
    # NOTE(review): 1 / 3 / 6 are presumably the GGUF token types
    # normal / control / byte - confirm against the gguf spec.
    if len(vbytes) == 0:
        return vbytes, 3
    if 3 <= tokid <= 258 and len(vbytes) == 1:
        # Always emit two hex digits (<0x0A>, not <0xA>) for single-byte tokens.
        return bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8'), 6
    return vbytes, 1


def _split_tensor_suffix(name):
    """Split a trailing '.weight' / '.bias' off ``name``; suffix is '' if absent."""
    for suffix in ('.weight', '.bias'):
        if name.endswith(suffix):
            return name[:-len(suffix)], suffix
    return name, ''


def save_gguf(ggml_model, data, cfg):
    """Write a loaded GGMLv3 model out as a GGUF file.

    Parameters:
        ggml_model: loaded model providing ``hyperparameters``, ``vocab``,
            ``tensors`` and ``tensor_map``.
        data: the buffer the model was loaded from; tensor payloads are
            sliced out of it using each tensor's recorded offset and length.
        cfg: parsed options; reads ``gqa``, ``output``, ``context_length``
            and ``eps``.

    Raises AssertionError if the layer 0 feed-forward tensor is missing, the
    KV head count cannot be derived from ``cfg.gqa``, or a tensor name has no
    GGUF mapping.
    """
    hp = ggml_model.hyperparameters
    # The feed-forward length is not in the hyperparameters, so take it from
    # the shape of the first layer's w1 tensor.
    ff_tensor_idx = ggml_model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
    assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
    ff_tensor = ggml_model.tensors[ff_tensor_idx]
    if cfg.gqa == 1:
        n_kv_head = hp.n_head
    else:
        # n_head / n_kv_head must equal the GQA factor exactly.
        gqa = int(cfg.gqa)
        n_kv_head = None
        if gqa > 0 and hp.n_head > 0 and hp.n_head % gqa == 0:
            n_kv_head = hp.n_head // gqa
        assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
        print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
    nm = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, hp.n_layer)
    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
    try:
        gguf_writer.add_context_length(cfg.context_length)
        gguf_writer.add_embedding_length(hp.n_embd)
        gguf_writer.add_block_count(hp.n_layer)
        gguf_writer.add_feed_forward_length(ff_tensor.dims[1])
        print('FF dim', ff_tensor.dims[1])
        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
        gguf_writer.add_head_count(hp.n_head)
        gguf_writer.add_head_count_kv(n_kv_head)
        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
        gguf_writer.add_tokenizer_model('llama')
        tokens = []
        scores = []
        toktypes = []
        print(f'* Adding {hp.n_vocab} vocab item(s)')
        for (tokid, (vbytes, vscore)) in enumerate(ggml_model.vocab.items):
            (vbytes, tt) = _convert_vocab_item(tokid, vbytes)
            toktypes.append(tt)
            tokens.append(vbytes)
            scores.append(vscore)
        gguf_writer.add_token_list(tokens)
        gguf_writer.add_token_scores(scores)
        gguf_writer.add_token_types(toktypes)
        print('* Adding tensors')
        for tensor in ggml_model.tensors:
            (name, suffix) = _split_tensor_suffix(str(tensor.name, 'UTF-8'))
            mapped_name = nm.get(name)
            assert mapped_name is not None, f'Bad name {name}'
            mapped_name += suffix
            tempdims = list(tensor.dims)
            if len(tempdims) > 1:
                # Swap the first two dimensions before handing the shape to
                # the writer. NOTE(review): presumably compensates for the
                # writer emitting the shape in reverse order - confirm.
                tempdims[0], tempdims[1] = tempdims[1], tempdims[0]
            print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
        print("gguf: write header")
        gguf_writer.write_header_to_file()
        print("gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
        print("gguf: write tensors")
        gguf_writer.write_tensors_to_file()
    finally:
        # Close the writer even when conversion fails part-way.
        gguf_writer.close()
203+
204+
def handle_args():
    """Parse the command-line options of the converter.

    ``--input`` and ``--output`` are mandatory: without them the conversion
    cannot proceed, so argparse reports the omission instead of the script
    failing later on a ``None`` filename.

    Returns the argparse namespace with ``input``, ``output``, ``gqa``,
    ``eps`` and ``context_length``.
    """
    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
    parser.add_argument('--input', '-i', required = True, help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', required = True, help ='Output GGUF filename')
    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    # eps stays a string here; the consumer converts it with float().
    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps (use 1e-5 for LLaMA2)')
    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length')
    return parser.parse_args()
212+
213+
def main():
    """Convert the GGMLv3 file named on the command line to GGUF."""
    cfg = handle_args()
    # Map the input read-only so tensor payloads are read from the file on demand.
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLV3Model()
    model.load(data, 0)
    print(model.hyperparameters)
    save_gguf(model, data, cfg)


# Only run the conversion when executed as a script, not on import.
if __name__ == '__main__':
    main()

gguf.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66

77
from enum import IntEnum, auto
8-
from typing import Any, IO, List
8+
from typing import Any, IO, List, Optional
99

1010
#
1111
# constants
@@ -325,8 +325,20 @@ def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
325325

326326

327327
class GGMLQuantizationType(IntEnum):
328-
F32 = 0
329-
F16 = 1
328+
F32 = 0
329+
F16 = 1
330+
Q4_0 = 2
331+
Q4_1 = 3
332+
Q5_0 = 6
333+
Q5_1 = 7
334+
Q8_0 = 8
335+
Q8_1 = 9
336+
Q2_K = 10
337+
Q3_K = 11
338+
Q4_K = 12
339+
Q5_K = 13
340+
Q6_K = 14
341+
Q8_K = 15
330342

331343

332344
class GGUFValueType(IntEnum):
@@ -359,7 +371,7 @@ def get_type(val):
359371

360372

361373
class GGUFWriter:
362-
def __init__(self, path: str, arch: str):
374+
def __init__(self, path: str, arch: str, use_temp_file = True):
363375
self.fout = open(path, "wb")
364376
self.arch = arch
365377
self.offset_tensor = 0
@@ -369,6 +381,8 @@ def __init__(self, path: str, arch: str):
369381
self.ti_data = b""
370382
self.ti_data_count = 0
371383
self.add_architecture()
384+
self.use_temp_file = use_temp_file
385+
self.tensors = []
372386

373387
def write_header_to_file(self):
374388
self.fout.write(struct.pack("<I", GGUF_MAGIC))
@@ -476,8 +490,8 @@ def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool =
476490
def ggml_pad(x: int, n: int) -> int:
477491
return ((x + n - 1) // n) * n
478492

479-
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int):
480-
assert tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
493+
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
494+
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
481495

482496
encoded_name = name.encode("utf8")
483497
self.ti_data += struct.pack("<I", len(encoded_name))
@@ -486,23 +500,30 @@ def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.
486500
self.ti_data += struct.pack("<I", n_dims)
487501
for i in range(n_dims):
488502
self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
489-
490-
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
503+
if raw_dtype is None:
504+
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
505+
else:
506+
dtype = raw_dtype
491507
self.ti_data += struct.pack("<I", dtype)
492508
self.ti_data += struct.pack("<Q", self.offset_tensor)
493509
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
494510
self.ti_data_count += 1
495511

496-
def add_tensor(self, name: str, tensor: np.ndarray):
497-
if not hasattr(self, "temp_file"):
512+
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray], raw_dtype: Optional[GGMLQuantizationType] = None):
513+
if self.use_temp_file and not hasattr(self, "temp_file"):
498514
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
499515
self.temp_file.seek(0)
500516

501-
self.add_tensor_info(name, tensor.shape, tensor.dtype, tensor.nbytes)
517+
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
518+
519+
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
520+
521+
if not self.use_temp_file:
522+
self.tensors.append((tensor, pad))
523+
return
502524

503525
tensor.tofile(self.temp_file)
504526

505-
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
506527
if pad != 0:
507528
self.temp_file.write(bytes([0] * pad))
508529

@@ -524,6 +545,13 @@ def write_tensors_to_file(self):
524545
if pad != 0:
525546
self.fout.write(bytes([0] * pad))
526547

548+
if not self.use_temp_file:
549+
for (currtensor, currpad) in self.tensors:
550+
currtensor.tofile(self.fout)
551+
if currpad != 0:
552+
self.fout.write(bytes([0] * currpad))
553+
return
554+
527555
self.temp_file.seek(0)
528556

529557
shutil.copyfileobj(self.temp_file, self.fout)

0 commit comments

Comments
 (0)