 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade a GPT4All model to the current format"
+    )
+    parser.add_argument("gpt4all_model", help="path to gpt4all-lora-quantized.bin")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
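
A quick usage note: the reformat leaves the CLI unchanged, so the script still takes the model path and the LLaMA tokenizer path as its two positional arguments. Assuming the file is named convert-gpt4all-to-ggml.py (the name is not shown in this diff), an invocation would look like:

    python convert-gpt4all-to-ggml.py ./gpt4all-lora-quantized.bin ./tokenizer.model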
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
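
A note on the format string: with five HPARAMS keys, "i" * (3 + len(HPARAMS)) is "iiiiiiii", i.e. eight native int32s, matching the eight fields that write_header unpacks below (the magic, the five hyperparameters, rot, and ftype). A minimal standalone sketch of the same read, assuming a model file at the path named in the script's help text:

    import struct

    HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

    with open("gpt4all-lora-quantized.bin", "rb") as f:
        fmt = "i" * (3 + len(HPARAMS))  # 8 x int32: magic + 5 hparams + rot + ftype
        fields = struct.unpack(fmt, f.read(struct.calcsize(fmt)))
        magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = fields
        print(hex(magic), vocab_size, dim)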
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66,  # magic: ggmf in hex
-        1, # file version
+        0x67676D66,  # magic: ggmf in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
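
The two magic constants are ASCII tags: 0x67676D6C spells "ggml" (the old, unversioned container this script accepts) and 0x67676D66 spells "ggmf" (the versioned container it writes, which is why a file-version int of 1 follows the magic). A quick check in a REPL:

    >>> bytes.fromhex("67676D6C").decode()
    'ggml'
    >>> bytes.fromhex("67676D66").decode()
    'ggmf'

(On disk the magic goes through struct.pack("i", ...), so the byte order follows the machine's native endianness.)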
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -71,22 +77,25 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", 0.0))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
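
Taken together, read_tokens and write_tokens show what the conversion does to the vocabulary section: the old file stores each token as an int32 length followed by that many bytes, and the new file adds a float32 score after the bytes (written as 0.0 here, since the old format carries no scores). A sketch of reading one token record in the new format, assuming the part of write_tokens elided by the hunk above still writes the length prefix first:

    import struct

    def read_new_token(f):
        # int32 length, the token bytes, then the float32 score added by this conversion
        (length,) = struct.unpack("i", f.read(4))
        text = f.read(length)
        (score,) = struct.unpack("f", f.read(4))
        return text, score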
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
         if not buf:
             break
         f_out.write(buf)
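
copy_all_data streams the remaining tensor data through a 1 MiB buffer so the whole model never has to sit in memory; the standard library offers the same behavior in one call:

    import shutil

    # equivalent to copy_all_data(f_out, f_in), 1 MiB at a time
    shutil.copyfileobj(f_in, f_out, length=1024 * 1024)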
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -96,12 +105,14 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
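
Note the safe-replace pattern here: the converted model is written to <path>.tmp, the input is preserved as <path>.orig, and only then is the temp file renamed over the original path. If a conversion needs to be undone, the backup can simply be moved back (illustrative paths):

    import os

    os.rename("gpt4all-lora-quantized.bin.orig", "gpt4all-lora-quantized.bin")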
 
+
 def main():
     args = parse_args()
 
     tokenizer = SentencePieceProcessor(args.tokenizer_model)
 
     convert_one_file(args.gpt4all_model, tokenizer)
 
+
 if __name__ == "__main__":
     main()