@@ -3490,59 +3490,84 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & params = ctx->model.hparams;

-    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-    int scale_factor = ctx->model.hparams.proj_scale_factor;
-
-    if (ctx->proj_type() == PROJECTOR_TYPE_LDP
-            || ctx->proj_type() == PROJECTOR_TYPE_LDPV2
-            || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
-        n_patches /= 4;
-        if (ctx->model.mm_glm_tok_boi) {
-            n_patches += 2; // for BOI and EOI token embeddings
-        }
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
-        if (params.minicpmv_version == 2) {
-            n_patches = 96;
-        }
-        else if (params.minicpmv_version == 3) {
-            n_patches = 64;
-        }
-        else if (params.minicpmv_version == 4) {
-            n_patches = 64;
-        }
-        else {
-            GGML_ABORT("Unknown minicpmv version");
-        }
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL) {
-        int patch_size = params.patch_size * 2;
-        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
-        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
-        n_patches = x_patch * y_patch;
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
-        int n_per_side = params.image_size / params.patch_size;
-        int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
-        n_patches = n_per_side_2d_pool * n_per_side_2d_pool;
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3 || ctx->proj_type() == PROJECTOR_TYPE_INTERNVL) {
-        // both W and H are divided by proj_scale_factor
-        n_patches /= (params.proj_scale_factor * params.proj_scale_factor);
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_PIXTRAL) {
-        int n_merge = params.spatial_merge_size;
-        int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
-        int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
-        n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
-        n_patches /= (scale_factor * scale_factor);
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) {
-        const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
-        const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
-        n_patches = n_len / proj_stack_factor / 2;
-    } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) {
-        // divide by 2 because of whisper
-        // another divide by 2 because of nn.AvgPool1d(2, stride=2)
-        n_patches = img->nx / 4;
+    // only for models using fixed size square images
+    int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+
+    projector_type proj = ctx->proj_type();
+
+    switch (proj) {
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                n_patches_sq /= 4;
+                if (ctx->model.mm_glm_tok_boi) {
+                    n_patches_sq += 2; // for BOI and EOI token embeddings
+                }
+            } break;
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                if (params.minicpmv_version == 2) {
+                    n_patches_sq = 96;
+                } else if (params.minicpmv_version == 3) {
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 4) {
+                    n_patches_sq = 64;
+                } else {
+                    GGML_ABORT("Unknown minicpmv version");
+                }
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN25VL:
+            {
+                // dynamic size
+                int patch_size = params.patch_size * 2;
+                int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+                int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+                n_patches_sq = x_patch * y_patch;
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+            {
+                int n_per_side = params.image_size / params.patch_size;
+                int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
+                n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
+            } break;
+        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
+            {
+                // both W and H are divided by proj_scale_factor
+                n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+            {
+                // dynamic size
+                int n_merge = params.spatial_merge_size;
+                int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
+                int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
+                n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                int scale_factor = ctx->model.hparams.proj_scale_factor;
+                n_patches_sq /= (scale_factor * scale_factor);
+            } break;
+        case PROJECTOR_TYPE_ULTRAVOX:
+            {
+                const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
+                const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
+                n_patches_sq = n_len / proj_stack_factor / 2;
+            } break;
+        case PROJECTOR_TYPE_QWEN2A:
+            {
+                // divide by 2 because of whisper
+                // another divide by 2 because of nn.AvgPool1d(2, stride=2)
+                n_patches_sq = img->nx / 4;
+            } break;
+        default:
+            GGML_ABORT("unsupported projector type");
    }

-    return n_patches;
+    return n_patches_sq;
}

static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
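
Note on the dynamic-size branches above: for QWEN2VL/QWEN25VL the token count is a ceiling division of the image dimensions by the merged patch size (patch_size * 2, because of the 2x2 spatial merge). A minimal standalone sketch of that arithmetic, with a hypothetical helper name that is not part of this patch:

    // Hypothetical helper (not in this patch): mirrors the ceil-division in the
    // QWEN2VL/QWEN25VL case; a partial patch at the right/bottom edge still
    // yields one output token.
    static int n_output_tokens_qwen2vl_sketch(int nx, int ny, int patch_size) {
        const int merged  = patch_size * 2;                          // 2x2 spatial merge
        const int x_patch = nx / merged + (nx % merged > 0 ? 1 : 0); // ceil(nx / merged)
        const int y_patch = ny / merged + (ny % merged > 0 ? 1 : 0); // ceil(ny / merged)
        return x_patch * y_patch;
    }

For example, with patch_size = 14 a 448x336 image gives ceil(448/28) * ceil(336/28) = 16 * 12 = 192 output tokens.
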
@@ -3747,7 +3772,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // set input per projector
-    switch (ctx->proj_type()) {
+    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_MINICPMV:
            {
                // inspired from siglip:
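
Taken together with the first hunk: clip_image_batch_encode fills a caller-supplied float buffer with one embedding vector per output token. A hedged caller-side sketch of sizing that buffer from the two accessors touched here, assuming the usual clip.h entry points (clip_image_encode writing into a float * destination); the names and parameter order below are an assumption, not taken from this commit:

    #include <vector>

    // Hedged sketch (not part of this commit): allocate room for
    // n_tokens x n_embd floats, then let the encoder fill it.
    static std::vector<float> encode_one_image(clip_ctx * ctx, int n_threads, clip_image_f32 * img) {
        const int n_tokens = clip_n_output_tokens(ctx, img); // rows
        const int n_embd   = clip_n_mmproj_embd(ctx);        // columns
        std::vector<float> out((size_t) n_tokens * (size_t) n_embd);
        clip_image_encode(ctx, n_threads, img, out.data());  // fills n_tokens * n_embd floats
        return out;
    }
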
@@ -4013,7 +4038,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    const auto & hparams = ctx->model.hparams;
-    switch (ctx->proj_type()) {
+    switch (ctx->model.proj_type) {
        case PROJECTOR_TYPE_LDP:
            return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
        case PROJECTOR_TYPE_LDPV2: