
Commit dc39a5e

mtmd : support SmolVLM (version 1 and 2) (#13050)
* mtmd : support SmolVLM (version 1 and 2)
* correct chat template
* fix n_patches
* scale_factor is an int
* add more models to test
1 parent ab47dec commit dc39a5e
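The "fix n_patches" and "scale_factor is an int" items above concern the pixel-shuffle step in the Idefics3/SmolVLM projector, which folds scale_factor^2 SigLIP patches into one image token. The following is only a sketch of that arithmetic; the function name and the concrete numbers (384 px image, 14 px patches, scale factor 3) are illustrative assumptions, not values taken from this commit.

def n_image_tokens(image_size: int, patch_size: int, scale_factor: int) -> int:
    # patches produced by the vision encoder
    n_patches = (image_size // patch_size) ** 2     # e.g. (384 // 14) ** 2 = 729
    # pixel shuffle merges scale_factor**2 neighbouring patches into one projected token,
    # which is why scale_factor has to be an integer
    return n_patches // (scale_factor ** 2)         # e.g. 729 // 9 = 81

print(n_image_tokens(384, 14, 3))  # 81 image tokens under these assumed values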

10 files changed: +279, -65 lines changed


convert_hf_to_gguf.py

Lines changed: 81 additions & 19 deletions
@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
     def load_hparams(dir_model: Path):
         with open(dir_model / "config.json", "r", encoding="utf-8") as f:
             hparams = json.load(f)
+        architectures = hparams.get("architectures")
         if "text_config" in hparams:
             hparams = {**hparams, **hparams["text_config"]}
+        if architectures is not None:
+            # preserve "architectures" from root level config
+            hparams["architectures"] = architectures
         return hparams

     @classmethod
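For context on the hunk above: load_hparams merges text_config into the root config, and for SmolVLM-style checkpoints that merge can clobber the root-level "architectures" entry that later model registration relies on. A minimal sketch with hypothetical config values (not taken from any real model):

root = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"architectures": ["SomeTextDecoder"], "hidden_size": 2048},  # hypothetical
}
hparams = {**root, **root["text_config"]}
# without the fix, the root-level value is lost:
assert hparams["architectures"] == ["SomeTextDecoder"]
# the added lines restore it:
hparams["architectures"] = root["architectures"]
assert hparams["architectures"] == ["SmolVLMForConditionalGeneration"]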
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
 class VisionModel(ModelBase):
     model_arch = gguf.MODEL_ARCH.CLIP_VISION
     n_text_embd = 0
+    preprocessor_config: dict[str, Any]
+    global_config: dict[str, Any]

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):

         if "vision_config" not in self.hparams:
             raise ValueError("vision_config not found in hparams")
-        # move vision config to the top level
+        # move vision config to the top level, while preserving the original hparams in global_config
+        self.global_config = self.hparams
         self.hparams = self.hparams["vision_config"]

+        # load preprocessor config
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
+
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)

     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
-        self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
+        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+        self.gguf_writer.add_vision_has_vision_encoder(True)

         # vision config
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+        self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
+        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+        # preprocessor config
+        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

     def write_vocab(self):
         raise ValueError("VisionModel does not support vocab writing")
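The VisionModel changes above read normalization statistics from the preprocessor_config.json shipped next to the model. A rough sketch of the two fields this code consumes, with placeholder 0.5 statistics rather than numbers from any specific checkpoint:

# placeholder contents for preprocessor_config.json; real models ship their own statistics
preprocessor_config = {
    "image_mean": [0.5, 0.5, 0.5],   # one value per RGB channel
    "image_std":  [0.5, 0.5, 0.5],
}
# these are the two keys set_gguf_parameters() passes on to the GGUF writer
image_mean = preprocessor_config["image_mean"]
image_std  = preprocessor_config["image_std"]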
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed norms: {norms}")


-@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@ModelBase.register(
+    "LLaMAForCausalLM",
+    "LlamaForCausalLM",
+    "MistralForCausalLM",
+    "MixtralForCausalLM",
+    "Idefics3ForConditionalGeneration",
+    "SmolVLMForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+
     def set_vocab(self):
         try:
             self._set_vocab_sentencepiece()
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [] # skip vision tensors
+        elif name.startswith("model.text_model"):
+            name = name.replace("text_model.", "") # for SmolVLM

         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
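The modify_tensors change above makes LlamaModel (the text half of SmolVLM) ignore everything belonging to the vision tower and connector, and strip the extra "text_model." prefix so the remaining names fall through to the usual Llama tensor mapping. A small sketch of that routing, using made-up tensor names purely as examples:

def route(name: str) -> str | None:
    is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
    if is_vision_tensor:
        return None  # handled by SmolVLMModel instead
    if name.startswith("model.text_model"):
        name = name.replace("text_model.", "")  # for SmolVLM
    return name

print(route("model.vision_model.encoder.layers.0.mlp.fc1.weight"))  # None
print(route("model.text_model.layers.0.self_attn.q_proj.weight"))   # model.layers.0.self_attn.q_proj.weight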
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
+class SmolVLMModel(VisionModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # fix for SmolVLM2, missing some keys in config.json
+        # default values are taken from transformers code
+        if self.hparams["model_type"] == "smolvlm_vision":
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
+        self.gguf_writer.add_vision_use_gelu(True)
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, new_name, n_dims # unused
+        if ".embeddings." in name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+
+        if is_vision_tensor:
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return [] # skip other tensors
+
+
 @ModelBase.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA4
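Putting the two files of this commit together: SmolVLMModel.set_gguf_parameters, plus the base VisionModel parameters above, ends up writing mmproj metadata under the clip.* keys declared in examples/llava/clip-impl.h later in this diff. A hedged summary of that mapping, with placeholder values wherever the commit itself does not pin a number:

mmproj_metadata = {
    "clip.projector_type":                       "idefics3",
    "clip.vision.image_size":                    512,              # placeholder, read from vision_config
    "clip.vision.patch_size":                    16,               # placeholder, read from vision_config
    "clip.vision.projector.scale_factor":        2,                # default when the config omits scale_factor
    "clip.vision.attention.layer_norm_epsilon":  1e-5,             # default when the config omits layer_norm_eps
    "clip.vision.image_mean":                    [0.5, 0.5, 0.5],  # placeholder, from preprocessor_config.json
    "clip.vision.image_std":                     [0.5, 0.5, 0.5],  # placeholder, from preprocessor_config.json
}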
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF tranformers code
-        self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
-        self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)

     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims # unused
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
             name = name.replace("_weight", ".weight")
-            if "fc1" in name:
-                name = name.replace("fc1", "fc2")
-            else:
-                name = name.replace("fc2", "fc1")

             # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
             # the other norm values are part of SigLIP model, and they are already correct

examples/llava/clip-impl.h

Lines changed: 5 additions & 2 deletions
@@ -33,13 +33,13 @@
 #define KEY_LAYER_NORM_EPS    "clip.%s.attention.layer_norm_epsilon"
 #define KEY_PROJ_DIM          "clip.%s.projection_dim"
 #define KEY_TOKENS            "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS       "clip.text.context_length"
 #define KEY_IMAGE_SIZE        "clip.vision.image_size"
 #define KEY_PATCH_SIZE        "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN        "clip.vision.image_mean"
 #define KEY_IMAGE_STD         "clip.vision.image_std"
-#define KEY_PROJ_TYPE         "clip.projector_type"
 #define KEY_FEATURE_LAYER     "clip.vision.feature_layer"
+#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
+#define KEY_PROJ_TYPE         "clip.projector_type"

 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -72,6 +72,7 @@
 #define TN_IMAGE_NEWLINE "model.image_newline"
 #define TN_MM_INP_PROJ   "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight"    // gemma3
+#define TN_MM_PROJECTOR  "mm.model.fc.weight"         // idefics3

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -99,6 +100,7 @@ enum projector_type {
     PROJECTOR_TYPE_GLM_EDGE,
     PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_GEMMA3,
+    PROJECTOR_TYPE_IDEFICS3,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -110,6 +112,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
     { PROJECTOR_TYPE_MERGER,   "qwen2vl_merger"},
     { PROJECTOR_TYPE_GEMMA3,   "gemma3"},
+    { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
