
Commit eec105a

blacked gpt4all-to-ggml
1 parent dfa2d70 commit eec105a

File tree

1 file changed (+20 -9 lines)

convert-gpt4all-to-ggml.py

Lines changed: 20 additions & 9 deletions
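Every change in the diff below is consistent with running Black, Python's auto-formatter, over the file: single quotes become double quotes, hex digits are uppercased, two blank lines are enforced between top-level definitions, a trailing comma is added to the multi-line list, and the over-long argparse call is wrapped. A minimal sketch of reproducing one such rewrite in-process, assuming the black package is installed (black.format_str is its programmatic entry point; the sample line is taken from this diff):

# Sketch: feed one pre-commit line through Black and observe the quote
# normalization seen throughout this diff. Assumes `pip install black`.
import black

src = "parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')\n"
out = black.format_str(src, mode=black.Mode())
print(out, end="")
# parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")

Black's default line length is 88 columns, which is why the two parser.add_argument calls stay on one line while the longer argparse.ArgumentParser call gets exploded across three.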
@@ -15,37 +15,43 @@
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade a GPT4All model to the current format"
+    )
+    parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -71,22 +77,25 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", 0.0))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -96,12 +105,14 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
 
     tokenizer = SentencePieceProcessor(args.tokenizer_model)
 
     convert_one_file(args.gpt4all_model, tokenizer)
 
+
 if __name__ == "__main__":
     main()
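The script's behavior is unchanged by this commit: per the argparse help above, it is still invoked as python convert-gpt4all-to-ggml.py path/to/gpt4all-lora-quantized.bin path/to/tokenizer.model. To make the header rewrite at the heart of the file concrete, here is a small standalone sketch of the layout read_header and write_header operate on; the hyperparameter values are illustrative stand-ins (LLaMA-7B-like), not read from a real model file:

# Standalone sketch of the header conversion: 8 int32s in, 9 int32s out.
# Values are illustrative; the real script gets them via read_header().
import struct

OLD_MAGIC = 0x67676D6C  # "ggml" in hex: the old, unversioned format
NEW_MAGIC = 0x67676D66  # "ggmf" in hex: the format with a version field

# Old header: magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype
old_header = struct.pack("i" * 8, OLD_MAGIC, 32000, 4096, 256, 32, 32, 128, 2)

magic, *rest = struct.unpack("i" * 8, old_header)
assert magic == OLD_MAGIC, "Invalid file magic. Must be an old style ggml file."

# New header: the new magic, then file version 1, then the same seven fields.
new_header = struct.pack("i" * 9, NEW_MAGIC, 1, *rest)
assert len(new_header) == 9 * 4  # one int32 longer than the old header

The inserted version field is the entire format upgrade at the header level; the token table is then rewritten with per-token scores (write_tokens) and the remaining tensor data is streamed through unchanged by copy_all_data.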

0 commit comments
