Skip to content

Commit 15e6125

Browse files
authored
mtmd : add hard limit on image resolution for qwen2vl / qwen2.5vl (#13434)
* mtmd : add hard limit on image resolution for qwen2vl / qwen2.5vl * fix typo
1 parent 3b24d26 commit 15e6125

File tree

2 files changed

+30
-8
lines changed

2 files changed

+30
-8
lines changed

tools/mtmd/clip-impl.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@
 #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
 #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"

+// align x to upper multiple of n
+#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
+
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,

tools/mtmd/clip.cpp

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,10 @@ struct clip_hparams {
     int32_t n_layer;
     int32_t proj_scale_factor = 0; // idefics3

+    // for models using dynamic image size, we need to have a smaller image size to warmup
+    // otherwise, user will get OOM everytime they load the model
+    int32_t warmup_image_size = 0;
+
     ffn_op_type ffn_op = FFN_GELU;

     patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
@@ -1796,6 +1800,9 @@ struct clip_model_loader {
         get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
         get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);

+        // default warmup value
+        hparams.warmup_image_size = hparams.image_size;
+
         ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP
             || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM
             || ctx_clip.proj_type == PROJECTOR_TYPE_LDP
@@ -1870,6 +1877,7 @@ struct clip_model_loader {
             case PROJECTOR_TYPE_PIXTRAL:
                 {
                     hparams.rope_theta = 10000.0f;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
                 } break;
             case PROJECTOR_TYPE_GEMMA3:
@@ -1880,8 +1888,19 @@ struct clip_model_loader {
                     // test model (tinygemma3) has a different value, we optionally read it
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
                 } break;
+            case PROJECTOR_TYPE_QWEN2VL:
+                {
+                    // max image size = sqrt(max_pixels)
+                    // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
+                    hparams.image_size = 3584;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
+                } break;
             case PROJECTOR_TYPE_QWEN25VL:
                 {
+                    // max image size = sqrt(max_pixels)
+                    // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
+                    hparams.image_size = 3584;
+                    hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                 } break;
             default:
@@ -2185,13 +2204,14 @@ struct clip_model_loader {
             // create a fake batch
             clip_image_f32_batch batch;
             clip_image_f32_ptr img(clip_image_f32_init());
-            img->nx = ctx_clip.vision_model.hparams.image_size;
-            img->ny = ctx_clip.vision_model.hparams.image_size;
+            img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
+            img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
             img->buf.resize(img->nx * img->ny * 3);
             batch.entries.push_back(std::move(img));

             ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
             ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
+
             for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
                 ggml_backend_t backend = ctx_clip.backend_ptrs[i];
                 ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
@@ -2590,8 +2610,8 @@ struct image_manipulation {
         float target_width_f  = static_cast<float>(inp_size.width)  * scale;
         float target_height_f = static_cast<float>(inp_size.height) * scale;

-        int aligned_width  = GGML_PAD((int)target_width_f,  align_size);
-        int aligned_height = GGML_PAD((int)target_height_f, align_size);
+        int aligned_width  = CLIP_ALIGN((int)target_width_f,  align_size);
+        int aligned_height = CLIP_ALIGN((int)target_height_f, align_size);

         return {aligned_width, aligned_height};
     }
@@ -2910,10 +2930,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
-        auto patch_size = clip_get_patch_size(ctx) * 2;
-        int nx = ceil((float)img->nx / patch_size) * patch_size;
-        int ny = ceil((float)img->ny / patch_size) * patch_size;
-        image_manipulation::bicubic_resize(*img, resized, nx, ny);
+        auto patch_size = params.patch_size * 2;
+        auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
+        image_manipulation::bicubic_resize(*img, resized, new_size.width, new_size.height);

         clip_image_f32_ptr img_f32(clip_image_f32_init());
         // clip_image_f32_ptr res(clip_image_f32_init());

0 commit comments

Comments
 (0)