Skip to content

Commit 9eb496b

Browse files
committed
also support tinygemma3 test model
1 parent b69401e commit 9eb496b

File tree

2 files changed

+38
-8
lines changed

2 files changed

+38
-8
lines changed

convert_hf_to_gguf.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3908,6 +3908,16 @@ def set_gguf_parameters(self):
39083908
# default values below are taken from HF transformers code
39093909
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
39103910
self.gguf_writer.add_vision_use_gelu(True)
3911+
# calculate proj_scale_factor (used by tinygemma3 test model)
3912+
image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
3913+
n_per_side = int(image_seq_length ** 0.5)
3914+
image_size = self.hparams["image_size"]
3915+
patch_size = self.hparams["patch_size"]
3916+
proj_scale_factor = (image_size // patch_size) // n_per_side
3917+
if proj_scale_factor > 0 and proj_scale_factor != 4:
3918+
# we only need to write this if it's not the default value
3919+
# in this case, we are converting a test model
3920+
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
39113921

39123922
def tensor_force_quant(self, name, new_name, bid, n_dims):
39133923
del bid, new_name, n_dims # unused
@@ -3921,6 +3931,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
39213931
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
39223932
del bid # unused
39233933

3934+
if "vision_model.head." in name:
3935+
return [] # skip redundant tensors for tinygemma3
3936+
39243937
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
39253938
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
39263939
# process vision tensors

tools/mtmd/clip.cpp

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -440,15 +440,14 @@ struct clip_graph {
440440

441441
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
442442
const int batch_size = 1;
443-
const int mm_tokens_per_image = 256; // default value for gemma3
444-
const int tokens_per_side = sqrt(mm_tokens_per_image);
445-
const int patches_per_image = sqrt(n_patches);
446-
const int kernel_size = patches_per_image / tokens_per_side;
443+
GGML_ASSERT(n_patches_x == n_patches_y);
444+
const int patches_per_image = n_patches_x;
445+
const int kernel_size = hparams.proj_scale_factor;
447446

448447
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
449448
cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
450449

451-
// doing a pool2d to reduce the number of output tokens to 256
450+
// doing a pool2d to reduce the number of output tokens
452451
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
453452
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size);
454453
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
@@ -1795,6 +1794,14 @@ struct clip_model_loader {
17951794
hparams.rope_theta = 10000.0f;
17961795
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
17971796
} break;
1797+
case PROJECTOR_TYPE_GEMMA3:
1798+
{
1799+
// default value (used by all model sizes in gemma 3 family)
1800+
// number of patches for each **side** is reduced by a factor of 4
1801+
hparams.proj_scale_factor = 4;
1802+
// test model (tinygemma3) has a different value, we optionally read it
1803+
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
1804+
} break;
17981805
case PROJECTOR_TYPE_QWEN25VL:
17991806
{
18001807
get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
@@ -1804,6 +1811,14 @@ struct clip_model_loader {
18041811
}
18051812

18061813
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
1814+
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
1815+
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
1816+
LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
1817+
LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
1818+
LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
1819+
LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
1820+
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
1821+
LOG_INF("\n");
18071822
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
18081823
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
18091824
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
@@ -2990,11 +3005,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
29903005
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
29913006
n_patches = x_patch * y_patch;
29923007
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2993-
n_patches = 256;
3008+
int n_per_side = params.image_size / params.patch_size;
3009+
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
3010+
n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
29943011
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2995-
n_patches /= ctx->vision_model.hparams.proj_scale_factor;
3012+
n_patches /= params.proj_scale_factor;
29963013
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2997-
int n_merge = ctx->vision_model.hparams.spatial_merge_size;
3014+
int n_merge = params.spatial_merge_size;
29983015
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
29993016
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
30003017
n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row

0 commit comments

Comments
 (0)