Commit 32916a4

clip : refactor graph builder (#13321)
* mtmd : refactor graph builder
* fix qwen2vl
* clean up siglip cgraph
* pixtral migrated
* move minicpmv to a dedicated build function
* move max_feature_layer to build_llava
* use build_attn for minicpm resampler
* fix windows build
* add comment for batch_size
* also support tinygemma3 test model
* qwen2vl does not use RMS norm
* fix qwen2vl norm (2)
1 parent: ffc7272
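
The last two bullets concern which normalization the Qwen2-VL vision graph should build. For context, below is a minimal NumPy sketch of the difference between standard LayerNorm and RMS norm; this is generic math for illustration (learnable scale and bias omitted), not code taken from clip.cpp.

import numpy as np

def layer_norm(x: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # standard LayerNorm: subtract the mean, then divide by the standard deviation
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)

def rms_norm(x: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # RMS norm: no mean subtraction, divide by the root mean square only
    return x / np.sqrt((x * x).mean(axis=-1, keepdims=True) + eps)

x = np.array([1.0, 2.0, 3.0, 4.0])
print(layer_norm(x))  # output has zero mean
print(rms_norm(x))    # mean is not removed, only the scale is normalized

Mixing the two up does not break the graph outright; it just shifts the activations, which is why this kind of mistake tends to surface as degraded output rather than an error.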

File tree

2 files changed: +1236 additions, -1191 deletions

convert_hf_to_gguf.py

Lines changed: 13 additions & 0 deletions
@@ -3915,6 +3915,16 @@ def set_gguf_parameters(self):
         # default values below are taken from HF tranformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims  # unused
@@ -3928,6 +3938,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        if "vision_model.head." in name:
+            return []  # skip redundant tensors for tinygemma3
+
         if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
             or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
             # process vision tensors
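
The new proj_scale_factor logic writes the value to the GGUF header only when it differs from the Gemma 3 default of 4. As a worked example, here is a small standalone sketch of that arithmetic; the config numbers are illustrative assumptions (896 / 14 / 256 roughly matching the stock Gemma 3 vision tower, plus a made-up tinygemma3-style configuration), not values read from a real checkpoint.

def proj_scale_factor(image_size: int, patch_size: int, image_seq_length: int) -> int:
    # same arithmetic as the diff above: patches per side divided by
    # output tokens per side after the projector's pooling
    n_per_side = int(image_seq_length ** 0.5)
    return (image_size // patch_size) // n_per_side

# Assumed stock Gemma 3 vision tower: 896 // 14 = 64 patches per side,
# 64 // 16 = 4 -> the default, so nothing is written to the GGUF.
print(proj_scale_factor(896, 14, 256))  # 4

# Hypothetical tinygemma3-style test config: 64 // 8 = 8 patches per side,
# 8 // 4 = 2 -> non-default, so add_vision_projector_scale_factor(2) is written.
print(proj_scale_factor(64, 8, 16))     # 2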

0 commit comments
