@@ -49,9 +49,8 @@ class Model(Protocol):
49
49
is_big_endian : bool
50
50
endianess : gguf .GGUFEndian
51
51
use_temp_file : bool
52
+ part_names : list [str ]
52
53
is_safetensors : bool
53
- num_parts : int
54
- part_names : Iterable [str ]
55
54
hparams : dict [str , Any ]
56
55
gguf_writer : gguf .GGUFWriter
57
56
block_count : int
@@ -67,9 +66,10 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
67
66
self .is_big_endian = is_big_endian
68
67
self .endianess = gguf .GGUFEndian .BIG if is_big_endian else gguf .GGUFEndian .LITTLE
69
68
self .use_temp_file = use_temp_file
70
- self .is_safetensors = self ._is_model_safetensors ()
71
- self .num_parts = Model .count_model_parts (self .dir_model , ".safetensors" if self .is_safetensors else ".bin" )
72
- self .part_names = self ._get_part_names ()
69
+ self .part_names = Model .get_model_part_names (self .dir_model , ".safetensors" )
70
+ self .is_safetensors = len (self .part_names ) > 0
71
+ if not self .is_safetensors :
72
+ self .part_names = Model .get_model_part_names (self .dir_model , ".bin" )
73
73
self .hparams = Model .load_hparams (self .dir_model )
74
74
self .gguf_writer = gguf .GGUFWriter (fname_out , gguf .MODEL_ARCH_NAMES [self .model_arch ], endianess = self .endianess , use_temp_file = self .use_temp_file )
75
75
self .block_count = self .find_hparam (["n_layers" , "num_hidden_layers" , "n_layer" ])
@@ -109,7 +109,7 @@ def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suf
109
109
sys .exit ()
110
110
if "{bid}" in name :
111
111
assert bid is not None
112
- name = name .format (bid )
112
+ name = name .format (bid = bid )
113
113
return name + suffix
114
114
115
115
def map_tensor_name (self , name : str , try_suffixes : Sequence [str ] = (".weight" , ".bias" )) -> str :
@@ -228,13 +228,13 @@ def write_vocab(self):
228
228
self .gguf_writer .close ()
229
229
230
230
@staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
    """Return the names of the entries in ``dir_model`` that end with ``suffix``.

    The result holds bare entry names (not full paths), as reported by
    ``os.listdir``. The scan is not recursive.

    ``os.listdir`` yields entries in arbitrary order, so the names are
    sorted before being returned; this keeps the part order stable between
    runs and across platforms.

    Args:
        dir_model: Directory to scan.
        suffix: Trailing string to match, e.g. ``".safetensors"`` or
            ``".bin"``.

    Returns:
        Sorted list of matching names; empty when nothing matches.
    """
    return sorted(
        filename for filename in os.listdir(dir_model) if filename.endswith(suffix)
    )
238
238
239
239
@staticmethod
240
240
def load_hparams (dir_model ):
@@ -258,19 +258,6 @@ def from_model_architecture(cls, arch):
258
258
except KeyError :
259
259
raise NotImplementedError (f'Architecture { arch !r} not supported!' ) from None
260
260
261
- def _is_model_safetensors (self ) -> bool :
262
- return Model .count_model_parts (self .dir_model , ".safetensors" ) > 0
263
-
264
- def _get_part_names (self ) -> Iterable [str ]:
265
- if self .is_safetensors :
266
- if self .num_parts == 1 : # there's only one .safetensors file
267
- return ("model.safetensors" ,)
268
- return (f"model-{ n :05} -of-{ self .num_parts :05} .safetensors" for n in range (1 , self .num_parts + 1 ))
269
-
270
- if self .num_parts == 1 : # there's only one .bin file
271
- return ("pytorch_model.bin" ,)
272
- return (f"pytorch_model-{ n :05} -of-{ self .num_parts :05} .bin" for n in range (1 , self .num_parts + 1 ))
273
-
274
261
# used for GPT-2 BPE and WordPiece vocabs
275
262
def get_vocab_base (self ) -> tuple [list [str ], list [int ], str ]:
276
263
tokens : list [str ] = []
@@ -446,7 +433,7 @@ def _set_vocab_sentencepiece(self):
446
433
raise FileNotFoundError (f"File not found: { tokenizer_path } " )
447
434
448
435
tokenizer = SentencePieceProcessor ()
449
- tokenizer .LoadFromFile (tokenizer_path )
436
+ tokenizer .LoadFromFile (str ( tokenizer_path ) )
450
437
451
438
vocab_size = self .hparams .get ('vocab_size' , tokenizer .vocab_size ())
452
439
@@ -1120,7 +1107,7 @@ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_
1120
1107
ename = f"model.layers.{ bid } .self_attn.{ layer_name } .norms.{ xid } .weight"
1121
1108
datas .append (norms [ename ])
1122
1109
del norms [ename ]
1123
- data_torch = torch .cat (datas , dim = 0 )
1110
+ data_torch = torch .stack (datas , dim = 0 )
1124
1111
1125
1112
merged_name = f"model.layers.{ bid } .self_attn.{ layer_name } .weight"
1126
1113
new_name = self .map_tensor_name (merged_name )
@@ -1204,7 +1191,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1204
1191
assert bid is not None
1205
1192
1206
1193
if self ._experts is None :
1207
- self ._experts = [{} for _ in range (n_experts )]
1194
+ self ._experts = [{} for _ in range (self . block_count )]
1208
1195
1209
1196
self ._experts [bid ][name ] = data_torch
1210
1197
@@ -1220,7 +1207,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1220
1207
datas .append (self ._experts [bid ][ename ])
1221
1208
del self ._experts [bid ][ename ]
1222
1209
1223
- data_torch = torch .cat (datas , dim = 0 )
1210
+ data_torch = torch .stack (datas , dim = 0 )
1224
1211
1225
1212
merged_name = f"layers.{ bid } .feed_forward.experts.{ wid } .weight"
1226
1213
@@ -1267,7 +1254,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1267
1254
assert bid is not None
1268
1255
1269
1256
if self ._experts is None :
1270
- self ._experts = [{} for _ in range (n_experts )]
1257
+ self ._experts = [{} for _ in range (self . block_count )]
1271
1258
1272
1259
self ._experts [bid ][name ] = data_torch
1273
1260
@@ -1283,7 +1270,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1283
1270
datas .append (self ._experts [bid ][ename ])
1284
1271
del self ._experts [bid ][ename ]
1285
1272
1286
- data_torch = torch .cat (datas , dim = 0 )
1273
+ data_torch = torch .stack (datas , dim = 0 )
1287
1274
1288
1275
merged_name = f"transformer.decoder_layer.{ bid } .moe.{ wid } .weight"
1289
1276
@@ -1484,7 +1471,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1484
1471
assert bid is not None
1485
1472
1486
1473
if self ._experts is None :
1487
- self ._experts = [{} for _ in range (n_experts )]
1474
+ self ._experts = [{} for _ in range (self . block_count )]
1488
1475
1489
1476
self ._experts [bid ][name ] = data_torch
1490
1477
@@ -1500,7 +1487,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
1500
1487
datas .append (self ._experts [bid ][ename ])
1501
1488
del self ._experts [bid ][ename ]
1502
1489
1503
- data_torch = torch .cat (datas , dim = 0 )
1490
+ data_torch = torch .stack (datas , dim = 0 )
1504
1491
1505
1492
merged_name = f"model.layers.{ bid } .mlp.experts.{ w_name } .weight"
1506
1493
@@ -1604,7 +1591,7 @@ def set_vocab(self):
1604
1591
sys .exit (1 )
1605
1592
1606
1593
tokenizer = SentencePieceProcessor ()
1607
- tokenizer .LoadFromFile (tokenizer_path )
1594
+ tokenizer .LoadFromFile (str ( tokenizer_path ) )
1608
1595
1609
1596
vocab_size = self .hparams .get ('vocab_size' , tokenizer .vocab_size ())
1610
1597
@@ -1786,7 +1773,7 @@ def set_vocab(self):
1786
1773
add_prefix = sentencepiece_model .normalizer_spec .add_dummy_prefix
1787
1774
1788
1775
tokenizer = SentencePieceProcessor ()
1789
- tokenizer .LoadFromFile (tokenizer_path )
1776
+ tokenizer .LoadFromFile (str ( tokenizer_path ) )
1790
1777
tokenizer .serialized_model_proto
1791
1778
1792
1779
vocab_size = self .hparams .get ('vocab_size' , tokenizer .vocab_size ())
@@ -2171,13 +2158,15 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2171
2158
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
    """Tell whether ``new_name`` is one of the SSM tensors listed below.

    Candidate names are produced by ``self.format_tensor_name`` for each
    listed tensor kind and the given block index.
    NOTE(review): judging by the method name, a True result presumably
    marks the tensor to be stored as F32 - confirm against the caller.

    Args:
        name: Original tensor name. Only whether it ends with ``.weight``
            is inspected, to choose the suffix of the candidate names.
        new_name: Name to look up among the candidates.
        bid: Block index used to format the candidate names, or ``None``.
        n_dims: Unused.

    Returns:
        True when ``bid`` is not ``None`` and ``new_name`` equals one of
        the candidate names; False otherwise.
    """
    del n_dims  # unused

    # format_tensor_name asserts on a missing block index for per-block
    # name templates, so answer early instead of tripping that assert.
    if bid is None:
        return False

    # The suffix does not depend on the tensor kind; compute it once.
    suffix = ".weight" if name.endswith(".weight") else ""
    ssm_tensors = (
        gguf.MODEL_TENSOR.SSM_CONV1D,
        gguf.MODEL_TENSOR.SSM_X,
        gguf.MODEL_TENSOR.SSM_DT,
        gguf.MODEL_TENSOR.SSM_A,
        gguf.MODEL_TENSOR.SSM_D,
    )
    return any(
        new_name == self.format_tensor_name(kind, bid, suffix)
        for kind in ssm_tensors
    )
2181
2170
2182
2171
2183
2172
@Model .register ("CohereForCausalLM" )
0 commit comments