@@ -1898,6 +1898,55 @@ def prepare_tensors(self):
1898
1898
raise ValueError (f"Unprocessed experts: { experts } " )
1899
1899
1900
1900
1901
@ModelBase.register("LlavaForConditionalGeneration")
class LlavaVisionModel(VisionModel):
    """Vision-tower converter for LLaVA-style checkpoints.

    Currently only the pixtral variant is supported: its vision config is
    often incomplete on disk, so missing hyperparameters are filled in with
    the upstream defaults at construction time.
    """

    # Token id of the [IMG_BREAK] marker; -1 means "not applicable".
    img_break_tok_id = -1

    # Defaults for fields that pixtral checkpoints frequently omit from
    # config.json (values match the HF implementation).
    _PIXTRAL_DEFAULTS = (
        ("num_attention_heads", 16),
        ("num_hidden_layers", 24),
        ("intermediate_size", 4096),
        ("hidden_size", 1024),
        ("layer_norm_eps", 1e-5),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams["model_type"] != "pixtral":
            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
        # Backfill any hyperparameters the config file left out.
        for key, default in self._PIXTRAL_DEFAULTS:
            self.hparams[key] = self.hparams.get(key, default)
        self.img_break_tok_id = 12  # see tokenizer_config.json

    def set_gguf_parameters(self):
        """Write pixtral-specific vision metadata into the GGUF header."""
        super().set_gguf_parameters()
        hparams = self.hparams
        if hparams["model_type"] == "pixtral":
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
            # default values below are taken from HF transformers code
            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
            self.gguf_writer.add_vision_use_silu(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map HF tensor names/layouts to GGUF; drop everything non-vision.

        Q/K projections are permuted to the llama.cpp head layout, and the
        [IMG_BREAK] row is sliced out of the token-embedding matrix.
        """
        del bid  # unused
        num_heads = self.hparams["num_attention_heads"]
        num_kv_heads = num_heads

        if name.startswith(("multi_modal_projector.", "vision_tower.")):
            # process vision tensors
            if name.endswith(("q_proj.weight", "q_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, num_heads, num_heads)
            if name.endswith(("k_proj.weight", "k_proj.bias")):
                data_torch = LlamaModel.permute(data_torch, num_heads, num_kv_heads)
            return [(self.map_tensor_name(name), data_torch)]

        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
            row = data_torch[self.img_break_tok_id]
            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
            return [(self.map_tensor_name(name), row)]

        return []  # skip other tensors
1901
1950
@ModelBase .register ("Idefics3ForConditionalGeneration" , "SmolVLMForConditionalGeneration" )
1902
1951
class SmolVLMModel (VisionModel ):
1903
1952
def __init__ (self , * args , ** kwargs ):
0 commit comments