 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade old ggml model files to the current format"
+    )
+    parser.add_argument("dir_model", help="directory containing ggml .bin files")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -60,22 +66,25 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
         if not buf:
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -85,6 +94,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
     files = []
@@ -96,5 +106,6 @@ def main():
     for file in files:
         convert_one_file(file, tokenizer)
 
+
 if __name__ == "__main__":
     main()