@@ -187,7 +187,7 @@ struct clip_hparams {
187
187
float eps = 1e-6 ;
188
188
float rope_theta = 0.0 ;
189
189
190
- std::vector<int32_t > image_grid_pinpoints;
190
+ std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
191
191
int32_t image_crop_resolution;
192
192
std::unordered_set<int32_t > vision_feature_layer;
193
193
int32_t attn_window_size = 0 ;
@@ -2109,8 +2109,7 @@ struct clip_model_loader {
2109
2109
if (is_vision) {
2110
2110
get_u32 (KEY_IMAGE_SIZE, hparams.image_size );
2111
2111
get_u32 (KEY_PATCH_SIZE, hparams.patch_size );
2112
- get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
2113
- get_arr_int (KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints , false );
2112
+ get_u32 (KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution , false );
2114
2113
get_i32 (KEY_MINICPMV_VERSION, hparams.minicpmv_version , false ); // legacy
2115
2114
2116
2115
} else if (is_audio) {
@@ -2120,6 +2119,20 @@ struct clip_model_loader {
2120
2119
GGML_ASSERT (false && " unknown modality" );
2121
2120
}
2122
2121
2122
+ // for pinpoints, we need to convert it into a list of resolution candidates
2123
+ {
2124
+ std::vector<int > pinpoints;
2125
+ get_arr_int (KEY_IMAGE_GRID_PINPOINTS, pinpoints, false );
2126
+ if (!pinpoints.empty ()) {
2127
+ for (size_t i = 0 ; i < pinpoints.size (); i += 2 ) {
2128
+ hparams.image_res_candidates .push_back ({
2129
+ pinpoints[i],
2130
+ pinpoints[i+1 ],
2131
+ });
2132
+ }
2133
+ }
2134
+ }
2135
+
2123
2136
// default warmup value
2124
2137
hparams.warmup_image_size = hparams.image_size ;
2125
2138
@@ -2231,16 +2244,7 @@ struct clip_model_loader {
2231
2244
{
2232
2245
hparams.rope_theta = 10000 .0f ;
2233
2246
get_u32 (KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor );
2234
-
2235
- // borrowed from llava-1.6
2236
- const int isize = hparams.image_size ;
2237
- hparams.image_grid_pinpoints = {
2238
- isize, isize*2 , // 336, 672
2239
- isize*2 , isize, // 672, 336
2240
- isize*2 , isize*2 , // 672, 672
2241
- isize*3 , isize, // 1008, 336
2242
- isize, isize*3 , // 336, 1008
2243
- };
2247
+ set_llava_uhd_res_candidates (model, 3 );
2244
2248
} break ;
2245
2249
case PROJECTOR_TYPE_ULTRAVOX:
2246
2250
case PROJECTOR_TYPE_QWEN2A:
@@ -2674,6 +2678,21 @@ struct clip_model_loader {
2674
2678
output[i] = values[i];
2675
2679
}
2676
2680
}
2681
+
2682
+ void set_llava_uhd_res_candidates (clip_model & model, const int max_patches_per_side) {
2683
+ auto & hparams = model.hparams ;
2684
+ for (int x = 1 ; x <= max_patches_per_side; x++) {
2685
+ for (int y = 1 ; y <= max_patches_per_side; y++) {
2686
+ if (x == 1 && y == 1 ) {
2687
+ continue ; // skip the first point
2688
+ }
2689
+ hparams.image_res_candidates .push_back (clip_image_size{
2690
+ x*hparams.image_size ,
2691
+ y*hparams.image_size ,
2692
+ });
2693
+ }
2694
+ }
2695
+ }
2677
2696
};
2678
2697
2679
2698
struct clip_init_result clip_init (const char * fname, struct clip_context_params ctx_params) {
@@ -3028,36 +3047,41 @@ struct llava_uhd {
3028
3047
bool padding_refined = false ; // if true, refine image will be padded to the grid size (e.g. llava-1.6)
3029
3048
};
3030
3049
3031
- static int get_max_slices (struct clip_ctx * ctx) {
3032
- if (clip_is_minicpmv (ctx)) {
3033
- return 9 ;
3034
- }
3035
- return 0 ;
3036
- }
3037
-
3038
3050
static slice_instructions get_slice_instructions (struct clip_ctx * ctx, const clip_image_size & original_size) {
3039
3051
slice_instructions res;
3040
3052
const int patch_size = clip_get_patch_size (ctx);
3041
3053
const int slice_size = clip_get_image_size (ctx);
3042
- const int max_slice_nums = get_max_slices (ctx);
3043
3054
const int original_width = original_size.width ;
3044
3055
const int original_height = original_size.height ;
3045
- const float log_ratio = log ((float )original_width / original_height);
3046
- const float ratio = (float )original_width * original_height / (slice_size * slice_size);
3047
- const int multiple = fmin (ceil (ratio), max_slice_nums);
3048
- const bool has_slices = (multiple > 1 );
3049
- const bool has_pinpoints = !ctx->model .hparams .image_grid_pinpoints .empty ();
3056
+
3057
+ const bool has_slices = original_size.width > slice_size || original_size.height > slice_size;
3058
+ const bool has_pinpoints = !ctx->model .hparams .image_res_candidates .empty ();
3059
+
3060
+ if (!has_slices) {
3061
+ // skip slicing logic
3062
+ res.overview_size = clip_image_size{slice_size, slice_size};
3063
+ res.refined_size = clip_image_size{0 , 0 };
3064
+ res.grid_size = clip_image_size{0 , 0 };
3065
+
3066
+ return res;
3067
+ }
3050
3068
3051
3069
if (has_pinpoints) {
3052
3070
// has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
3053
3071
auto refine_size = llava_uhd::select_best_resolution (
3054
- ctx-> model . hparams . image_grid_pinpoints ,
3055
- original_size );
3072
+ original_size ,
3073
+ ctx-> model . hparams . image_res_candidates );
3056
3074
res.overview_size = clip_image_size{slice_size, slice_size};
3057
3075
res.refined_size = refine_size;
3058
3076
res.grid_size = clip_image_size{0 , 0 };
3059
3077
res.padding_refined = true ;
3060
3078
3079
+ LOG_DBG (" %s: using pinpoints for slicing\n " , __func__);
3080
+ LOG_DBG (" %s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n " ,
3081
+ __func__, original_width, original_height,
3082
+ res.overview_size .width , res.overview_size .height ,
3083
+ res.refined_size .width , res.refined_size .height );
3084
+
3061
3085
for (int y = 0 ; y < refine_size.height ; y += slice_size) {
3062
3086
for (int x = 0 ; x < refine_size.width ; x += slice_size) {
3063
3087
slice_coordinates slice;
@@ -3066,13 +3090,16 @@ struct llava_uhd {
3066
3090
slice.size .width = std::min (slice_size, refine_size.width - x);
3067
3091
slice.size .height = std::min (slice_size, refine_size.height - y);
3068
3092
res.slices .push_back (slice);
3069
- if (x == 0 ) {
3070
- res.grid_size . width ++;
3071
- }
3093
+ LOG_DBG ( " %s: slice %d: x=%d, y=%d, size=%dx%d \n " ,
3094
+ __func__, ( int ) res.slices . size () - 1 ,
3095
+ slice. x , slice. y , slice. size . width , slice. size . height );
3072
3096
}
3073
- res.grid_size .height ++;
3074
3097
}
3075
3098
3099
+ res.grid_size .height = refine_size.height / slice_size;
3100
+ res.grid_size .width = refine_size.width / slice_size;
3101
+ LOG_DBG (" %s: grid size: %d x %d\n " , __func__, res.grid_size .width , res.grid_size .height );
3102
+
3076
3103
return res;
3077
3104
}
3078
3105
@@ -3081,17 +3108,23 @@ struct llava_uhd {
3081
3108
auto best_size = get_best_resize (original_size, slice_size, patch_size, !has_slices);
3082
3109
res.overview_size = best_size;
3083
3110
3084
- if (!has_slices) {
3085
- // skip slicing logic
3086
- res.refined_size = clip_image_size{0 , 0 };
3087
- res.grid_size = clip_image_size{0 , 0 };
3111
+ {
3112
+ const int max_slice_nums = 9 ; // TODO: this is only used by minicpmv, maybe remove it
3113
+ const float log_ratio = log ((float )original_width / original_height);
3114
+ const float ratio = (float )original_width * original_height / (slice_size * slice_size);
3115
+ const int multiple = fmin (ceil (ratio), max_slice_nums);
3088
3116
3089
- } else {
3090
3117
auto best_grid = get_best_grid (max_slice_nums, multiple, log_ratio);
3091
3118
auto refine_size = get_refine_size (original_size, best_grid, slice_size, patch_size, true );
3092
3119
res.grid_size = best_grid;
3093
3120
res.refined_size = refine_size;
3094
3121
3122
+ LOG_DBG (" %s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n " ,
3123
+ __func__, original_width, original_height,
3124
+ res.overview_size .width , res.overview_size .height ,
3125
+ res.refined_size .width , res.refined_size .height ,
3126
+ res.grid_size .width , res.grid_size .height );
3127
+
3095
3128
int width = refine_size.width ;
3096
3129
int height = refine_size.height ;
3097
3130
int grid_x = int (width / best_grid.width );
@@ -3108,7 +3141,9 @@ struct llava_uhd {
3108
3141
slice.size .width = grid_x;
3109
3142
slice.size .height = grid_y;
3110
3143
res.slices .push_back (slice);
3111
- // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
3144
+ LOG_DBG (" %s: slice %d: x=%d, y=%d, size=%dx%d\n " ,
3145
+ __func__, (int )res.slices .size () - 1 ,
3146
+ slice.x , slice.y , slice.size .width , slice.size .height );
3112
3147
}
3113
3148
}
3114
3149
}
@@ -3166,48 +3201,55 @@ struct llava_uhd {
3166
3201
return res;
3167
3202
}
3168
3203
3204
+ static clip_image_size resize_maintain_aspect_ratio (const clip_image_size & orig, const clip_image_size & target_max) {
3205
+ float scale_width = static_cast <float >(target_max.width ) / orig.width ;
3206
+ float scale_height = static_cast <float >(target_max.height ) / orig.height ;
3207
+ float scale = std::min (scale_width, scale_height);
3208
+ return clip_image_size{
3209
+ static_cast <int >(orig.width * scale),
3210
+ static_cast <int >(orig.height * scale),
3211
+ };
3212
+ }
3213
+
3169
3214
/* *
3170
3215
* Selects the best resolution from a list of possible resolutions based on the original size.
3171
3216
*
3217
+ * For example, when given a list of resolutions:
3218
+ * - 100x100
3219
+ * - 200x100
3220
+ * - 100x200
3221
+ * - 200x200
3222
+ *
3223
+ * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
3224
+ *
3172
3225
* @param original_size The original size of the image
3173
3226
* @param possible_resolutions A list of possible resolutions
3174
3227
* @return The best fit resolution
3175
3228
*/
3176
3229
static clip_image_size select_best_resolution (const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
3177
- int original_width = original_size.width ;
3178
- int original_height = original_size.height ;
3179
3230
clip_image_size best_fit;
3231
+ int min_wasted_area = std::numeric_limits<int >::max ();
3180
3232
int max_effective_resolution = 0 ;
3181
- int min_wasted_resolution = std::numeric_limits<int >::max ();
3182
-
3183
- for (const auto & resolution : possible_resolutions) {
3184
- int width = resolution.width ;
3185
- int height = resolution.height ;
3186
- float scale = std::min (static_cast <float >(width) / original_width, static_cast <float >(height) / original_height);
3187
- int downscaled_width = static_cast <int >(original_width * scale);
3188
- int downscaled_height = static_cast <int >(original_height * scale);
3189
- int effective_resolution = std::min (downscaled_width * downscaled_height, original_width * original_height);
3190
- int wasted_resolution = (width * height) - effective_resolution;
3191
- // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
3192
- if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
3233
+
3234
+ for (const clip_image_size & candidate : possible_resolutions) {
3235
+ auto target_size = resize_maintain_aspect_ratio (original_size, candidate);
3236
+ int effective_resolution = std::min (
3237
+ target_size.width * target_size.height ,
3238
+ original_size.width * original_size.height );
3239
+ int wasted_area = (candidate.width * candidate.height ) - effective_resolution;
3240
+
3241
+ if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
3193
3242
max_effective_resolution = effective_resolution;
3194
- min_wasted_resolution = wasted_resolution ;
3195
- best_fit = resolution ;
3243
+ min_wasted_area = wasted_area ;
3244
+ best_fit = candidate ;
3196
3245
}
3246
+
3247
+ LOG_DBG (" %s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n " , __func__, candidate.width , candidate.height , target_size.width , target_size.height , wasted_area, effective_resolution);
3197
3248
}
3198
3249
3199
3250
return best_fit;
3200
3251
}
3201
3252
3202
- // used by llava 1.6 with custom list of pinpoints
3203
- static clip_image_size select_best_resolution (const std::vector<int32_t > & pinpoints, const clip_image_size & original_size) {
3204
- std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
3205
- for (size_t i = 0 ; i < pinpoints.size (); i += 2 ) {
3206
- possible_resolutions.push_back (clip_image_size{pinpoints[i], pinpoints[i+1 ]});
3207
- }
3208
- return select_best_resolution (original_size, possible_resolutions);
3209
- }
3210
-
3211
3253
// Snaps `length` to the nearest multiple of `patch_size` (computed in float,
// halves rounded away from zero by std::round), never returning less than
// `patch_size` itself.
static int ensure_divide(int length, int patch_size) {
    const float n_patches = std::round(static_cast<float>(length) / patch_size);
    const int   snapped   = static_cast<int>(n_patches * patch_size);
    // lower bound: at least one full patch
    return snapped < patch_size ? patch_size : snapped;
}
@@ -3331,7 +3373,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3331
3373
return true ;
3332
3374
3333
3375
} else if (ctx->proj_type () == PROJECTOR_TYPE_LLAMA4) {
3334
- GGML_ASSERT (!params.image_grid_pinpoints .empty ());
3376
+ GGML_ASSERT (!params.image_res_candidates .empty ());
3335
3377
auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
3336
3378
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
3337
3379
@@ -3371,7 +3413,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
3371
3413
res_imgs->entries .push_back (std::move (res));
3372
3414
return true ;
3373
3415
3374
- } else if (!params.image_grid_pinpoints .empty ()) {
3416
+ } else if (!params.image_res_candidates .empty ()) {
3375
3417
// "spatial_unpad" with "anyres" processing for llava-1.6
3376
3418
auto const inst = llava_uhd::get_slice_instructions (ctx, original_size);
3377
3419
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image (img, inst);
@@ -3431,17 +3473,6 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
3431
3473
return ctx->model .hparams .mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? " spatial_unpad" : " flat" ;
3432
3474
}
3433
3475
3434
- const int32_t * clip_image_grid (const struct clip_ctx * ctx) {
3435
- if (ctx->model .hparams .image_grid_pinpoints .size ()) {
3436
- return &ctx->model .hparams .image_grid_pinpoints .front ();
3437
- }
3438
- return nullptr ;
3439
- }
3440
-
3441
- size_t get_clip_image_grid_size (const struct clip_ctx * ctx) {
3442
- return ctx->model .hparams .image_grid_pinpoints .size ();
3443
- }
3444
-
3445
3476
int clip_n_output_tokens_x (const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3446
3477
const auto & params = ctx->model .hparams ;
3447
3478
const int n_total = clip_n_output_tokens (ctx, img);
0 commit comments