@@ -369,14 +369,27 @@ def from_model_architecture(cls, arch: str) -> type[Model]:
        except KeyError:
            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

+    def _is_model_safetensors(self) -> bool:
+        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
+
+    def _get_part_names(self):
+        if self.is_safetensors:
+            if self.num_parts == 1:  # there's only one .safetensors file
+                return ("model.safetensors",)
+            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
+
+        if self.num_parts == 1:  # there's only one .bin file
+            return ("pytorch_model.bin",)
+        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
+
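Note: a short standalone sketch of the shard-naming scheme that `_get_part_names` produces above; the helper name `list_part_names` and the three-part split are illustrative only, not part of the diff.

    # hypothetical illustration of the part-name pattern used above
    def list_part_names(num_parts: int, suffix: str = ".safetensors") -> list[str]:
        if num_parts == 1:
            return ["model" + suffix]  # single-file checkpoints carry no index
        # multi-part checkpoints use zero-padded five-digit indices
        return [f"model-{n:05}-of-{num_parts:05}{suffix}" for n in range(1, num_parts + 1)]

    print(list_part_names(3))
    # ['model-00001-of-00003.safetensors', 'model-00002-of-00003.safetensors', 'model-00003-of-00003.safetensors']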
    # used for GPT-2 BPE and WordPiece vocabs
    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

        tokpre = self.get_vocab_base_pre(tokenizer)
@@ -403,7 +416,6 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
    # NOTE: this function is generated by convert-hf-to-gguf-update.py
    # do not modify it manually!
    # ref: https://github.com/ggerganov/llama.cpp/pull/6920
-    # Marker: Start get_vocab_base_pre
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
@@ -415,14 +427,13 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

-        logger.debug(f"chktok: {chktok}")
-        logger.debug(f"chkhsh: {chkhsh}")
+        print(f"chktok: {chktok}")
+        print(f"chkhsh: {chkhsh}")

        res = None

-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
-        # or pull the latest version of the model from Huggingface
-        # don't edit the hashes manually!
+        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
+        # don't do this manually - use the convert-hf-to-gguf-update.py script!
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
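Note: the hashes compared in this chain are derived exactly as computed above: the check text is encoded with the model's tokenizer and the SHA-256 of the stringified token-ID list becomes the fingerprint. A standalone sketch, assuming the `transformers` package is installed; the model id is only an example and the check string is a placeholder.

    # hypothetical reproduction of the chkhsh fingerprint
    from hashlib import sha256
    from transformers import AutoTokenizer

    chktxt = "..."  # placeholder for the fixed check string defined earlier in get_vocab_base_pre
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # example model id
    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    print(chkhsh)  # compared against the known values in the if-chain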
@@ -447,60 +458,23 @@ def get_vocab_base_pre(self, tokenizer) -> str:
        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
            # ref: https://huggingface.co/openai-community/gpt2
            res = "gpt-2"
-        if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
-            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
-            res = "stablelm2"
-        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
-            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
-            res = "refact"
-        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
-            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
-            res = "command-r"
-        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
-            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
-            res = "qwen2"
-        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
-            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
-            res = "olmo"
-        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-base
-            res = "dbrx"
-        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
-            res = "jina-v2-en"
-        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
-            res = "jina-v2-es"
-        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
-            res = "jina-v2-de"
-        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
-            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
-            res = "smaug-bpe"
-        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
-            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
-            res = "jina-v2-code"

        if res is None:
-            logger.warning("\n")
-            logger.warning("**************************************************************************************")
-            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
-            logger.warning("** There are 2 possible reasons for this:")
-            logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
-            logger.warning("** - the pre-tokenization config has changed upstream")
-            logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
-            logger.warning("**")
-            logger.warning(f"** chkhsh: {chkhsh}")
-            logger.warning("**************************************************************************************")
-            logger.warning("\n")
+            print("\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("** This means that it was not added yet or you are using an older version.")
+            print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**")
+            print(f"** chkhsh: {chkhsh}")
+            print("**************************************************************************************")
+            print("\n")

            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

-        logger.debug(f"tokenizer.ggml.pre: {repr(res)}")
-        logger.debug(f"chkhsh: {chkhsh}")
+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")

        return res
-    # Marker: End get_vocab_base_pre

    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()
@@ -509,7 +483,7 @@ def _set_vocab_gpt2(self) -> None:
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
@@ -2163,35 +2137,26 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.vocab_size = len(tokens)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"

        # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # handle special tokens
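Note: the phantom-space conversion above maps WordPiece pieces onto the SentencePiece-style convention llama.cpp expects: bracketed special tokens pass through, `##` continuation pieces lose their prefix, and every other piece gains a leading U+2581 (`▁`) word-boundary marker. A standalone sketch of that mapping, with example tokens only:

    # hypothetical illustration of the phantom-space mapping
    def phantom(tok: str) -> str:
        if tok.startswith("[") and tok.endswith("]"):
            return tok         # special tokens such as [CLS] and [SEP] are kept as-is
        if tok.startswith("##"):
            return tok[2:]     # continuation pieces attach to the preceding piece
        return "\u2581" + tok  # word-initial pieces get the metaspace marker

    print([phantom(t) for t in ["[CLS]", "hello", "##ing", "[SEP]"]])
    # ['[CLS]', '▁hello', 'ing', '[SEP]']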
@@ -2237,16 +2202,6 @@ def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-
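Note: the removed `get_tensors` override handled Nomic Embed checkpoints whose embedding matrix is padded up to a multiple of 64 rows; the padding rows are sliced off so the tensor matches the real vocabulary size. A rough NumPy sketch of the same idea (the sizes are made-up example values):

    import numpy as np

    vocab_size = 30522                                 # real vocabulary size (example)
    rounded_vocab_size = (vocab_size + 63) // 64 * 64  # 30528, the padded row count
    emb = np.zeros((rounded_vocab_size, 768), dtype=np.float32)  # padded embedding table
    emb = emb[:vocab_size, :]                          # drop the padding before writing to GGUF
    print(emb.shape)                                   # (30522, 768)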

@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
@@ -2409,15 +2364,25 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
        del n_dims  # unused

-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-                gguf.MODEL_TENSOR.SSM_CONV1D,
-                gguf.MODEL_TENSOR.SSM_X,
-                gguf.MODEL_TENSOR.SSM_DT,
-                gguf.MODEL_TENSOR.SSM_A,
-                gguf.MODEL_TENSOR.SSM_D,
-            ]
-        )
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if self.ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert big float32 2-dim weight tensors to float16
+        new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+        if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+        self.gguf_writer.add_tensor(new_name, data)
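Note: the block added above converts each tensor's dtype before it is written: with ftype 0 (f32 output) any float16 data is promoted to float32, while with ftype 1 (f16 output) one-dimensional tensors stay float32 and only two-dimensional weights with selected names are cast to float16. A condensed standalone sketch of that decision, assuming NumPy arrays; the function and tensor names are illustrative only:

    import numpy as np

    def convert_dtype(data: np.ndarray, ftype: int, new_name: str) -> np.ndarray:
        if ftype == 0 and data.dtype == np.float16:
            return data.astype(np.float32)   # f32 output: promote everything to float32
        if ftype == 1 and data.dtype == np.float16 and data.ndim == 1:
            return data.astype(np.float32)   # 1-D tensors (norms, biases) are kept in float32
        base = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
        if ftype == 1 and data.dtype == np.float32 and data.ndim == 2 \
                and base.endswith((".ssm_in", ".ssm_out", "token_embd", "output")):
            return data.astype(np.float16)   # big 2-D projection weights go to float16
        return data

    w = np.ones((4096, 4096), dtype=np.float32)
    print(convert_dtype(w, ftype=1, new_name="token_embd.weight").dtype)  # float16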


@Model.register("CohereForCausalLM")