@@ -440,15 +440,14 @@ struct clip_graph {
440
440
441
441
if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
442
442
const int batch_size = 1 ;
443
- const int mm_tokens_per_image = 256 ; // default value for gemma3
444
- const int tokens_per_side = sqrt (mm_tokens_per_image);
445
- const int patches_per_image = sqrt (n_patches);
446
- const int kernel_size = patches_per_image / tokens_per_side;
443
+ GGML_ASSERT (n_patches_x == n_patches_y);
444
+ const int patches_per_image = n_patches_x;
445
+ const int kernel_size = hparams.proj_scale_factor ;
447
446
448
447
cur = ggml_cont (ctx0, ggml_transpose (ctx0, cur));
449
448
cur = ggml_reshape_4d (ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
450
449
451
- // doing a pool2d to reduce the number of output tokens to 256
450
+ // doing a pool2d to reduce the number of output tokens
452
451
cur = ggml_pool_2d (ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0 , 0 );
453
452
cur = ggml_reshape_3d (ctx0, cur, cur->ne [0 ] * cur->ne [0 ], n_embd, batch_size);
454
453
cur = ggml_cont (ctx0, ggml_transpose (ctx0, cur));
@@ -1795,6 +1794,14 @@ struct clip_model_loader {
1795
1794
hparams.rope_theta = 10000 .0f ;
1796
1795
get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size , false );
1797
1796
} break ;
1797
+ case PROJECTOR_TYPE_GEMMA3:
1798
+ {
1799
+ // default value (used by all model sizes in gemma 3 family)
1800
+ // number of patches for each **side** is reduced by a factor of 4
1801
+ hparams.proj_scale_factor = 4 ;
1802
+ // test model (tinygemma3) has a different value, we optionally read it
1803
+ get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
1804
+ } break ;
1798
1805
case PROJECTOR_TYPE_QWEN25VL:
1799
1806
{
1800
1807
get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern );
@@ -1804,6 +1811,14 @@ struct clip_model_loader {
1804
1811
}
1805
1812
1806
1813
LOG_INF (" %s: projector: %s\n " , __func__, proj_type.c_str ());
1814
+ LOG_INF (" %s: n_embd: %d\n " , __func__, hparams.n_embd );
1815
+ LOG_INF (" %s: n_head: %d\n " , __func__, hparams.n_head );
1816
+ LOG_INF (" %s: n_ff: %d\n " , __func__, hparams.n_ff );
1817
+ LOG_INF (" %s: n_layer: %d\n " , __func__, hparams.n_layer );
1818
+ LOG_INF (" %s: projection_dim: %d\n " , __func__, hparams.projection_dim );
1819
+ LOG_INF (" %s: image_size: %d\n " , __func__, hparams.image_size );
1820
+ LOG_INF (" %s: patch_size: %d\n " , __func__, hparams.patch_size );
1821
+ LOG_INF (" \n " );
1807
1822
LOG_INF (" %s: has_llava_proj: %d\n " , __func__, ctx_clip.has_llava_projector );
1808
1823
LOG_INF (" %s: minicpmv_version: %d\n " , __func__, ctx_clip.minicpmv_version );
1809
1824
LOG_INF (" %s: proj_scale_factor: %d\n " , __func__, hparams.proj_scale_factor );
@@ -2990,11 +3005,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
2990
3005
int y_patch = img->ny / patch_size + (int )(img->ny % patch_size > 0 );
2991
3006
n_patches = x_patch * y_patch;
2992
3007
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
2993
- n_patches = 256 ;
3008
+ int n_per_side = params.image_size / params.patch_size ;
3009
+ int n_per_side_2d_pool = n_per_side / params.proj_scale_factor ;
3010
+ n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
2994
3011
} else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
2995
- n_patches /= ctx-> vision_model . hparams .proj_scale_factor ;
3012
+ n_patches /= params .proj_scale_factor ;
2996
3013
} else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
2997
- int n_merge = ctx-> vision_model . hparams .spatial_merge_size ;
3014
+ int n_merge = params .spatial_merge_size ;
2998
3015
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1 );
2999
3016
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1 );
3000
3017
n_patches = n_patches_y*n_patches_x + n_patches_y - 1 ; // + one [IMG_BREAK] per row, except the last row
0 commit comments