@@ -1008,6 +1008,29 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
1008
1008
self .gguf_writer .add_add_eos_token (field .parts [- 1 ].tolist ()[0 ])
1009
1009
1010
1010
1011
# TODO: maybe merge this with Model in the future
class VisionModelHelper:
    """Helper around a :class:`Model` for vision conversions.

    Locates the language model's token-embedding tensor once at construction
    time, then extracts individual embedding rows for special vision tokens
    (e.g. ``<image>``) so they can be exported as standalone GGUF tensors.
    """
    # the wrapped text model whose weights we read
    model: Model
    # last tensor whose name ends with "embed_tokens.weight", or None if absent
    tok_embd_tensor: Tensor | None = None

    def __init__(self, model: Model):
        self.model = model
        # TODO: how to do this without reading the whole safetensor file?
        for tname, tensor in model.get_tensors():
            if tname.endswith("embed_tokens.weight"):
                self.tok_embd_tensor = tensor

    def get_embd_for_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, gguf.MODEL_TENSOR]], tensor_name_postfix: str = '.weight') -> Iterable[tuple[str, Tensor]]:
        """Yield ``(gguf_tensor_name, embedding_row)`` for each requested token.

        Args:
            map_token_to_tensor_name: pairs of (token text, target GGUF tensor enum).
            tensor_name_postfix: suffix appended to the mapped GGUF tensor name.

        Raises:
            ValueError: if no token-embedding tensor was found at init time.
            KeyError: if a requested token is missing from the tokenizer vocab.
        """
        if self.tok_embd_tensor is None:
            raise ValueError("Token embedding tensor not found")
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(self.model.dir_model, trust_remote_code=True)
        # hoisted out of the loop: get_vocab() constructs a fresh dict on every call
        vocab = tokenizer.get_vocab()
        for token, tensor_name in map_token_to_tensor_name:
            row = self.tok_embd_tensor[vocab[token]]
            yield gguf.TENSOR_NAMES[tensor_name] + tensor_name_postfix, row
1011
1034
@Model .register ("GPTNeoXForCausalLM" )
1012
1035
class GPTNeoXModel (Model ):
1013
1036
model_arch = gguf .MODEL_ARCH .GPTNEOX
@@ -2355,11 +2378,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
2355
2378
2356
2379
@Model .register ("MiniCPMV" )
2357
2380
class MiniCPMVModel (Qwen2Model ):
2358
- # based on minicpmv-surgery.py, not sure why it is Qwen2Model instead of MiniCPMModel
2381
+ # MiniCPM-V 2.5 is Qwen2 and 2.6 is Qwen-2.5
2359
2382
model_arch = gguf .MODEL_ARCH .QWEN2
2360
2383
proj_type : gguf .constants .CLIPProjectorType | None
2361
2384
resampler_n_embd = 0
2362
- tok_embd_tensor : Tensor | None = None
2385
+ vhelper : VisionModelHelper | None
2363
2386
2364
2387
def __init__ (self , * args , ** kwargs ):
2365
2388
super ().__init__ (* args , ** kwargs )
@@ -2378,56 +2401,49 @@ def __init__(self, *args, **kwargs):
2378
2401
self .proj_type = gguf .constants .CLIPProjectorType .MINICPMV_2_6
2379
2402
else :
2380
2403
raise ValueError (f"Unsupported MiniCPM-V version: { version } " )
2404
+ self .vhelper = VisionModelHelper (self )
2381
2405
# TODO: how to do this without reading the whole safetensor file?
2382
2406
for tname , tensor in self .get_tensors ():
2383
2407
if tname == "resampler.ln_post.bias" :
2384
2408
self .resampler_n_embd = tensor .shape [0 ]
2385
- if tname .endswith ("embed_tokens.weight" ):
2386
- self .tok_embd_tensor = tensor
2387
2409
if self .resampler_n_embd < 2 :
2388
2410
raise ValueError ("Failed to detect resampler embedding size" )
2389
2411
else :
2390
2412
raise ValueError ("Expected vision_config, but not found" )
2391
2413
2392
- if self .vparams is not None and self .vision_arch is not None and self .preprocessor_config is not None :
2393
- self .preprocessor_config ["image_mean" ] = [0.5 , 0.5 , 0.5 ]
2394
- self .preprocessor_config ["image_std" ] = [0.5 , 0.5 , 0.5 ]
2395
- self .hparams ["vision_feature_layer" ] = 0
2396
- self .v_tensor_map = gguf .get_tensor_name_map (self .vision_arch , self .vparams ["num_hidden_layers" ])
2397
-
2398
- def get_embd_of_tokens (self , map_token_to_tensor_name : Iterable [tuple [str , str ]]) -> Iterable [tuple [str , Tensor ]]:
2399
- if self .tok_embd_tensor is None :
2400
- raise ValueError ("Token embedding tensor not found" )
2401
- from transformers import AutoTokenizer
2402
- tokenizer = AutoTokenizer .from_pretrained (self .dir_model , trust_remote_code = True )
2403
- for token , tensor_name in map_token_to_tensor_name :
2404
- tok_id = tokenizer .get_vocab ()[token ]
2405
- row = self .tok_embd_tensor [tok_id ]
2406
- yield tensor_name , row
2414
+ assert self .vparams is not None
2415
+ assert self .vision_arch is not None
2416
+ assert self .preprocessor_config is not None
2417
+ self .preprocessor_config ["image_mean" ] = [0.5 , 0.5 , 0.5 ]
2418
+ self .preprocessor_config ["image_std" ] = [0.5 , 0.5 , 0.5 ]
2419
+ self .hparams ["vision_feature_layer" ] = 0
2420
+ self .v_tensor_map = gguf .get_tensor_name_map (self .vision_arch , self .vparams ["num_hidden_layers" ])
2407
2421
2408
2422
def set_gguf_parameters(self):
    """Write the vision (ViT) GGUF parameters on top of the base Qwen2 ones."""
    super().set_gguf_parameters()
    assert self.vparams is not None and self.proj_type is not None
    writer = self.gguf_writer
    writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
    writer.add_vision_vit_projector_type(self.proj_type)
    writer.add_vision_vit_layer_norm_epsilon(1e-06)
    # one position embedding per patch of a square image grid
    patches_per_side = self.vparams["image_size"] // self.vparams["patch_size"]
    writer.add_vision_vit_max_position_embeddings(patches_per_side ** 2)
2417
2430
2418
2431
2419
2432
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
    # because the model operates exclusively on 70x70 patches for now, we should
    # precompute the positional embeddings to gain performance
    # in the future, we can do it in cpp if we figure out how to do it efficiently
    pos_embd = torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
    yield self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True), pos_embd

    assert self.vhelper is not None
    # special vision tokens whose embedding rows are exported as dedicated tensors
    added_tokens = [
        ("<image>", gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE),
        ("</image>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE),
        ("<slice>", gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE),
        ("</slice>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE),
    ]
    yield from self.vhelper.get_embd_for_tokens(added_tokens)
2432
2448
2433
2449
def modify_tensors (self , data_torch : Tensor , name : str , bid : int | None ) -> Iterable [tuple [str , Tensor ]]:
0 commit comments