@@ -174,6 +174,10 @@ struct clip_hparams {
174
174
int32_t n_layer;
175
175
int32_t proj_scale_factor = 0 ; // idefics3
176
176
177
+ // for models using dynamic image size, we need a smaller image size for warmup
178
+ // otherwise, the user will get OOM every time they load the model
179
+ int32_t warmup_image_size = 0 ;
180
+
177
181
ffn_op_type ffn_op = FFN_GELU;
178
182
179
183
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
@@ -1796,6 +1800,9 @@ struct clip_model_loader {
1796
1800
get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
1797
1801
get_arr_int (KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints , false );
1798
1802
1803
+ // default warmup value
1804
+ hparams.warmup_image_size = hparams.image_size ;
1805
+
1799
1806
ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
1800
1807
|| ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
1801
1808
|| ctx_clip.proj_type == PROJECTOR_TYPE_LDP
@@ -1870,6 +1877,7 @@ struct clip_model_loader {
1870
1877
case PROJECTOR_TYPE_PIXTRAL:
1871
1878
{
1872
1879
hparams.rope_theta = 10000 .0f ;
1880
+ hparams.warmup_image_size = hparams.patch_size * 8 ;
1873
1881
get_u32 (KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size , false );
1874
1882
} break ;
1875
1883
case PROJECTOR_TYPE_GEMMA3:
@@ -1880,8 +1888,19 @@ struct clip_model_loader {
1880
1888
// test model (tinygemma3) has a different value, we optionally read it
1881
1889
get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor , false );
1882
1890
} break ;
1891
+ case PROJECTOR_TYPE_QWEN2VL:
1892
+ {
1893
+ // max image size = sqrt(max_pixels)
1894
+ // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
1895
+ hparams.image_size = 3584 ;
1896
+ hparams.warmup_image_size = hparams.patch_size * 8 ;
1897
+ } break ;
1883
1898
case PROJECTOR_TYPE_QWEN25VL:
1884
1899
{
1900
+ // max image size = sqrt(max_pixels)
1901
+ // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
1902
+ hparams.image_size = 3584 ;
1903
+ hparams.warmup_image_size = hparams.patch_size * 8 ;
1885
1904
get_u32 (KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern );
1886
1905
} break ;
1887
1906
default :
@@ -2185,13 +2204,14 @@ struct clip_model_loader {
2185
2204
// create a fake batch
2186
2205
clip_image_f32_batch batch;
2187
2206
clip_image_f32_ptr img (clip_image_f32_init ());
2188
- img->nx = ctx_clip.vision_model .hparams .image_size ;
2189
- img->ny = ctx_clip.vision_model .hparams .image_size ;
2207
+ img->nx = ctx_clip.vision_model .hparams .warmup_image_size ;
2208
+ img->ny = ctx_clip.vision_model .hparams .warmup_image_size ;
2190
2209
img->buf .resize (img->nx * img->ny * 3 );
2191
2210
batch.entries .push_back (std::move (img));
2192
2211
2193
2212
ggml_cgraph * gf = clip_image_build_graph (&ctx_clip, batch);
2194
2213
ggml_backend_sched_reserve (ctx_clip.sched .get (), gf);
2214
+
2195
2215
for (size_t i = 0 ; i < ctx_clip.backend_ptrs .size (); ++i) {
2196
2216
ggml_backend_t backend = ctx_clip.backend_ptrs [i];
2197
2217
ggml_backend_buffer_type_t buft = ctx_clip.backend_buft [i];
@@ -2590,8 +2610,8 @@ struct image_manipulation {
2590
2610
float target_width_f = static_cast <float >(inp_size.width ) * scale;
2591
2611
float target_height_f = static_cast <float >(inp_size.height ) * scale;
2592
2612
2593
- int aligned_width = GGML_PAD ((int )target_width_f, align_size);
2594
- int aligned_height = GGML_PAD ((int )target_height_f, align_size);
2613
+ int aligned_width = CLIP_ALIGN ((int )target_width_f, align_size);
2614
+ int aligned_height = CLIP_ALIGN ((int )target_height_f, align_size);
2595
2615
2596
2616
return {aligned_width, aligned_height};
2597
2617
}
@@ -2910,10 +2930,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2910
2930
}
2911
2931
else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
2912
2932
clip_image_u8 resized;
2913
- auto patch_size = clip_get_patch_size (ctx) * 2 ;
2914
- int nx = ceil ((float )img->nx / patch_size) * patch_size;
2915
- int ny = ceil ((float )img->ny / patch_size) * patch_size;
2916
- image_manipulation::bicubic_resize (*img, resized, nx, ny);
2933
+ auto patch_size = params.patch_size * 2 ;
2934
+ auto new_size = image_manipulation::calc_size_preserved_ratio (original_size, patch_size, params.image_size );
2935
+ image_manipulation::bicubic_resize (*img, resized, new_size.width , new_size.height );
2917
2936
2918
2937
clip_image_f32_ptr img_f32 (clip_image_f32_init ());
2919
2938
// clip_image_f32_ptr res(clip_image_f32_init());
0 commit comments