@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
             hparams = json.load(f)
+        architectures = hparams.get("architectures")
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
+        if architectures is not None:
+            # preserve "architectures" from root level config
+            hparams["architectures"] = architectures
         return hparams

     @classmethod
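# Illustrative sketch, not part of the diff: a made-up config where text_config
# carries its own "architectures" entry that would otherwise shadow the root-level
# one during the merge. The patched load_hparams snapshots the root value first and
# writes it back, as in the added lines above.
example_config = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"architectures": ["LlamaForCausalLM"], "hidden_size": 2048},
}
architectures = example_config.get("architectures")
merged = {**example_config, **example_config["text_config"]}
if architectures is not None:
    merged["architectures"] = architectures
assert merged["architectures"] == ["SmolVLMForConditionalGeneration"]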
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
 class VisionModel(ModelBase):
     model_arch = gguf.MODEL_ARCH.CLIP_VISION
     n_text_embd = 0
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):

         if "vision_config" not in self.hparams:
             raise ValueError("vision_config not found in hparams")
-        # move vision config to the top level
+        # move vision config to the top level, while preserving the original hparams in global_config
+        self.global_config = self.hparams
         self.hparams = self.hparams["vision_config"]

+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)

     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+        self.gguf_writer.add_vision_has_vision_encoder(True)

         # vision config
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+        # preprocessor config
+        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")
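# Illustrative sketch, not part of the diff: the minimal shape of the
# preprocessor_config.json that VisionModel.__init__ now loads from the model
# directory, and the two fields set_gguf_parameters() forwards to the GGUF writer.
# The values and path below are hypothetical placeholders, not a real checkpoint's.
import json
from pathlib import Path

dir_model = Path("/tmp/example-model")  # stand-in for the model directory
dir_model.mkdir(parents=True, exist_ok=True)
(dir_model / "preprocessor_config.json").write_text(json.dumps({
    "image_mean": [0.5, 0.5, 0.5],
    "image_std": [0.5, 0.5, 0.5],
}), encoding="utf-8")

with open(dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
    preprocessor_config = json.load(f)

print(preprocessor_config["image_mean"], preprocessor_config["image_std"])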
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed norms: {norms}")


-@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "Idefics3ForConditionalGeneration",
+    "SmolVLMForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return []  # skip vision tensors
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "")  # for SmolVLM

         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
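# Illustrative sketch, not part of the diff: how the routing added to
# LlamaModel.modify_tensors treats a few tensor names. The names are hypothetical
# examples in the SmolVLM checkpoint layout.
def route(name: str) -> str | None:
    is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
    if is_vision_tensor:
        return None  # dropped here; SmolVLMModel picks these up for the CLIP_VISION GGUF
    if name.startswith("model.text_model"):
        name = name.replace("text_model.", "")  # strip the SmolVLM text prefix
    return name

assert route("model.vision_model.encoder.layers.0.self_attn.q_proj.weight") is None
assert route("model.text_model.layers.0.self_attn.q_proj.weight") == "model.layers.0.self_attn.q_proj.weight"
assert route("lm_head.weight") == "lm_head.weight"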
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
+class SmolVLMModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing some keys in config.json
+        # default values are taken from transformers code
+        if self.hparams["model_type"] == "smolvlm_vision":
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims  # unused
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
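# Illustrative usage sketch, not part of the diff. With both classes registered for
# "SmolVLMForConditionalGeneration", the same checkpoint can be converted twice:
# LlamaModel emits the LLAMA text GGUF, and SmolVLMModel emits the CLIP_VISION
# (mmproj) GGUF. Assuming the script's --mmproj switch selects the VisionModel path,
# and with placeholder paths:
#
#   python convert_hf_to_gguf.py /path/to/SmolVLM2 --outfile smolvlm-text.gguf
#   python convert_hf_to_gguf.py /path/to/SmolVLM2 --outfile mmproj-smolvlm.gguf --mmproj
#
# tensor_force_quant above keeps the vision embeddings in F32 even when the rest of
# the mmproj tensors are quantized.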
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF transformers code
-        self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.USE_GELU, True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)

     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims  # unused
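# Note, not part of the diff: the hardcoded IMAGE_MEAN/IMAGE_STD arrays removed in
# the hunk above are no longer needed because super().set_gguf_parameters() in the
# VisionModel base class now writes image_mean and image_std from the model's own
# preprocessor_config.json (see the VisionModel hunk earlier in this diff).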
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
             name = name.replace("_weight", ".weight")
-            if "fc1" in name:
-                name = name.replace("fc1", "fc2")
-            else:
-                name = name.replace("fc2", "fc1")

             # correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of Gemma projector
             # the other norm values are part of SigLIP model, and they are already correct