Commit ee0c40d

Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the migrate-ggml-2023-03-30-pr613.py script will allow you to convert your old ggml files into the new mmap()'able format. See #613
1 parent 6f23ba5 commit ee0c40d

File tree

3 files changed: +326 −14 lines


convert-pth-to-ggml.py

Lines changed: 3 additions & 5 deletions
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggml compatible file
+# Convert a LLaMA model checkpoint to a ggjt compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -52,8 +52,8 @@
 }
 
 GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK/2,
-    GGML_TYPE_Q4_1: 4*2 + QK/2,
+    GGML_TYPE_Q4_0: 4 + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
     GGML_TYPE_I8: 1,
     GGML_TYPE_I16: 2,
     GGML_TYPE_I32: 4,
@@ -245,11 +245,9 @@ def main():
         fname_model = f"{dir_model}/consolidated.00.pth"
         fname_out = f"{dir_model}/ggml-vocab.bin"
         print(f"Extracting only the vocab from '{fname_model}'\n")
-        model = torch.load(fname_model, map_location="cpu")
         with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
-        del model
         print(f"Done. Output file: {fname_out}\n")
         return
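The QK/2 to QK//2 change matters because `/` in Python 3 always yields a float, and that float would leak into every byte count computed from GGML_TYPE_SIZE. A minimal standalone sketch (not part of the commit) of the difference:

# Minimal sketch: why the element sizes must stay integers.
QK = 32

size_old = 4 + QK / 2     # 20.0 -- float, what the previous code produced
size_new = 4 + QK // 2    # 20   -- int, what the fixed code produces

# byte count of a Q4_0 tensor with this many elements (block size is QK)
nelements = 4096 * 4096
nbytes = nelements * size_new // QK
print(size_old, size_new, nbytes)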

llama.cpp

Lines changed: 10 additions & 9 deletions
@@ -347,14 +347,15 @@ static void munmap_file(void * addr, size_t length) {
 #endif
 }
 
-static bool report_bad_magic(const char *path) {
+static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
     fprintf(stderr,
-            "%s: invalid model file (bad magic)\n"
-            "you most likely need to regenerate your ggml files\n"
-            "the benefit is you'll get 10-100x faster load times\n"
-            "see https://github.com/ggerganov/llama.cpp/issues/91\n"
-            "use convert-pth-to-ggml.py on your llama model files\n",
-            path);
+            "%s: invalid model file (bad magic [got %#x want %#x])\n"
+            "\tyou most likely need to regenerate your ggml files\n"
+            "\tthe benefit is you'll get 10-100x faster load times\n"
+            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+            path, got, want);
     return false;
 }
 
@@ -397,7 +398,7 @@ static bool llama_model_load(
         return false;
     }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname.c_str());
+        return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
     }
 
     uint32_t format_version;
@@ -1312,7 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         return false;
     }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname_inp.c_str());
+        return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
     }
 
     fout.write((char *) &magic, sizeof(magic));
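The updated error message now reports both the magic that was read and the one expected. As a complement, here is a small hedged sketch (not part of this commit; the helper name is hypothetical) for checking a model file's magic up front, using the two constants that appear in migrate-ggml-2023-03-30-pr613.py:

# Hypothetical helper: report which magic a model file carries before migrating.
import struct, sys

GGJT_MAGIC = 0x67676a74  # new mmap()'able format
GGMF_MAGIC = 0x67676d66  # previous versioned format, accepted by the migrator

def file_magic(path):
    # assumes little-endian storage, matching how the files are written on x86/ARM
    with open(path, "rb") as f:
        (magic,) = struct.unpack("<I", f.read(4))
    return magic

if __name__ == "__main__":
    magic = file_magic(sys.argv[1])
    if magic == GGJT_MAGIC:
        print("already in the new ggjt format, nothing to do")
    elif magic == GGMF_MAGIC:
        print("ggmf format: run migrate-ggml-2023-03-30-pr613.py")
    else:
        print("unrecognized magic %#x: regenerate with convert-pth-to-ggml.py" % magic)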

migrate-ggml-2023-03-30-pr613.py

Lines changed: 313 additions & 0 deletions
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
#     space, then this tool is intended to help you. Please check
#     out the instructions below.
#
# USAGE
#
#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#   pip install numpy
#   cd llama.cpp
#   make -j4
#
# EXAMPLE (7B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/7B/ggml-model-f16.bin
#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#   # check that it works
#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#   # you can delete the old files
#   rm -f models/13B/ggml-model-f16.bin*
#   mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#

import argparse
import os
import sys
import json
import struct
import numpy as np

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

HPARAMS = [
    'magic',    # int32
    'version',  # int32
    'n_vocab',  # int32
    'n_embd',   # int32
    'n_mult',   # int32
    'n_head',   # int32
    'n_layer',  # int32
    'n_rot',    # int32
    'f16',      # int32
]

def read_hparams(fin):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    buf = fin.read(struct_size)
    ints = struct.unpack(struct_fmt, buf)
    hparams = dict(zip(HPARAMS, ints))
    return hparams

def write_hparams(fout, hparams):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack(struct_fmt, *ints))

def read_tokens(fin, hparams):
    tokens = []
    for i in range(hparams['n_vocab']):
        len_b = fin.read(4)
        (length,) = struct.unpack("i", len_b)
        word = fin.read(length)
        score_b = fin.read(4)
        (score,) = struct.unpack("f", score_b)
        tokens.append((word, score))
    return tokens

def write_tokens(fout, tokens):
    for word, score in tokens:
        fout.write(struct.pack("i", len(word)))
        fout.write(word)
        fout.write(struct.pack("f", score))

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x

def copy_tensors(fin, fout, part_id, n_parts):
    while True:

        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)

        assert n_dims in (1, 2)

        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
        partshape = list(reversed(partshape))

        name = fin.read(length)
        data = fin.read(ggml_nbytes(partshape, ftype))

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))

def parse_args():
    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
    parser.add_argument('fout_path', help='your new ggjt file name')
    return parser.parse_args()

def main():
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74:  # ggjt
        print("%s: input ggml has already been converted to 'ggjt' magic\n" %
              (args.fin_path))
        sys.exit(1)

    if hparams['magic'] != 0x67676d66:  # ggmf
        print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" %
              (args.fin_path, hparams['magic']))
        sys.exit(1)

    hparams['magic'] = 0x67676a74  # ggjt

    # count number of multipart files by convention
    n_parts = 1
    while True:
        if os.path.exists("%s.%d" % (args.fin_path, n_parts)):
            n_parts += 1
        else:
            break

    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += ".%d" % (part_id)
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
    main()
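In copy_tensors, the byte offsets for reassembling a sharded tensor follow directly from the shapes: for split_dim 0 each part supplies a contiguous block of rows, for split_dim 1 each part supplies a slice of columns out of every row. A worked example with hypothetical numbers (an f16 tensor split across two parts; shapes are illustrative, not taken from a real model):

# Offset arithmetic as used by copy_tensors, with made-up shapes.
blck_size, type_size = 1, 2          # F16: one element per block, 2 bytes each
n_parts = 2
partshape = [2048, 4096]             # rows, cols held by each part

# split_dim == 0: each part holds a contiguous block of rows
fullshape = [partshape[0] * n_parts, partshape[1]]
bytes_per_row = fullshape[1] // blck_size * type_size
for part_id in range(n_parts):
    offset = part_id * partshape[0] * bytes_per_row
    print(f"split_dim 0, part {part_id}: rows start at byte offset {offset}")

# split_dim == 1: each part holds a slice of columns, so every row of the
# part is seeked to its own destination inside the full row
fullshape = [partshape[0], partshape[1] * n_parts]
bytes_per_row = fullshape[1] // blck_size * type_size
bpr = partshape[1] // blck_size * type_size        # bytes per row within a part
for part_id in range(n_parts):
    offset_current_col = part_id * partshape[1] // blck_size * type_size
    print(f"split_dim 1, part {part_id}: row 0 lands at byte offset {offset_current_col}, "
          f"{bpr} bytes copied per row")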
