 #include <map>
 #include <regex>
 #include <stdexcept>
+#include <unordered_set>
 #include <vector>
 #include <sstream>
 #include <cinttypes>
@@ -120,6 +121,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_MEAN     "clip.vision.image_mean"
 #define KEY_IMAGE_STD      "clip.vision.image_std"
 #define KEY_PROJ_TYPE      "clip.projector_type"
+#define KEY_FEATURE_LAYER  "clip.vision.feature_layer"
 
 #define KEY_MM_PATCH_MERGE_TYPE  "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
@@ -444,8 +446,9 @@ struct clip_hparams {
 
     char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
 
-    int32_t image_grid_pinpoints[32];
+    std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
+    std::unordered_set<int32_t> vision_feature_layer;
 };
 
 struct clip_layer {
@@ -585,6 +588,7 @@ struct clip_ctx {
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
 
+    int32_t max_feature_layer;
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
@@ -651,7 +655,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int hidden_size = hparams.hidden_size;
     const int n_head      = hparams.n_head;
     const int d_head      = hidden_size / n_head;
-    int n_layer           = hparams.n_layer;
     const float eps       = hparams.eps;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
@@ -752,13 +755,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
 
+    std::vector<struct ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
+
     // loop over layers
-    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
-        n_layer += 1;
-    }
-    for (int il = 0; il < n_layer - 1; il++) {
+    for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
+        // If this is an embedding feature layer, save the output.
+        // NOTE: 0 index here refers to the input to the encoder.
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(embeddings);
+        }
+
         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
 
         // layernorm1
@@ -846,7 +855,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);
 
         embeddings = cur;
-
     }
 
     // post-layernorm
@@ -857,6 +865,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
     }
 
+    // final layer is a vision feature layer
+    if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) {
+        embedding_stack.push_back(embeddings);
+    }
+
+    // If feature layers are explicitly set, stack them (if we have multiple)
+    if (!embedding_stack.empty()) {
+        embeddings = embedding_stack[0];
+        for (size_t i = 1; i < embedding_stack.size(); i++) {
+            embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+        }
+    }
+
     // llava projector
     if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
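Review note: `ggml_concat(ctx0, ..., 0)` joins the saved feature layers along `ne[0]`, the hidden dimension, so each token's feature vector grows by one `hidden_size` per saved layer. A minimal standalone sketch of that layout in plain C++ (not ggml; sizes and names are illustrative):

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Two fake feature layers of shape [hidden, n_tokens], stored with the
    // hidden axis contiguous, matching ggml's ne[0]-fastest layout.
    const int hidden = 4, n_tokens = 2;
    std::vector<float> layer_a(hidden * n_tokens, 1.0f);
    std::vector<float> layer_b(hidden * n_tokens, 2.0f);

    // Concatenating along the hidden axis: per token, layer_b's features
    // are appended after layer_a's, giving shape [2*hidden, n_tokens].
    std::vector<float> stacked;
    for (int t = 0; t < n_tokens; ++t) {
        stacked.insert(stacked.end(), layer_a.begin() + t * hidden, layer_a.begin() + (t + 1) * hidden);
        stacked.insert(stacked.end(), layer_b.begin() + t * hidden, layer_b.begin() + (t + 1) * hidden);
    }
    std::printf("stacked: [%d x %d]\n", 2 * hidden, n_tokens); // stacked: [8 x 2]
    return 0;
}
```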
@@ -1443,14 +1464,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
         int n = gguf_get_arr_n(ctx, idx);
         const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-        for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) {
-            hparams.image_grid_pinpoints[i] = pinpoints[i];
+        for (int i = 0; i < n; ++i) {
+            hparams.image_grid_pinpoints.push_back(pinpoints[i]);
         }
-        if (n < 32)
-            hparams.image_grid_pinpoints[n] = 0;
-    } catch (std::runtime_error & /*e*/) {
-        hparams.image_grid_pinpoints[0] = 0;
-    }
+    } catch (std::runtime_error & /*e*/) { }
+
+    // Load the vision feature layer indices if they are explicitly provided;
+    // if multiple vision feature layers are present, the values will be concatenated
+    // to form the final visual features.
+    // NOTE: gguf conversions should standardize the values of the vision feature layer to
+    // be non-negative, since we use -1 to mark values as unset here.
+    try {
+        int idx = get_key_idx(ctx, KEY_FEATURE_LAYER);
+        int n = gguf_get_arr_n(ctx, idx);
+
+        const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
+
+        for (int i = 0; i < n; ++i) {
+            hparams.vision_feature_layer.insert(vision_feature_layer[i]);
+        }
+    } catch (std::runtime_error & /*e*/) { }
 
     try {
         int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
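As the NOTE above says, gguf conversions are expected to write non-negative feature-layer indices. A hypothetical sketch of that normalization on the converter side, assuming HF-style negative indices and the convention stated earlier that index 0 is the encoder input (so -1 maps to the last encoder layer, `n_layer`); `normalize_feature_layer` is an illustrative name, not an existing function:

```cpp
#include <cstdio>
#include <unordered_set>

// Hypothetical helper: map HF-style negative indices (-1 == last layer,
// -2 == second-to-last, ...) to the non-negative values this loader expects.
static int normalize_feature_layer(int layer, int n_layer) {
    return layer < 0 ? n_layer + layer + 1 : layer;
}

int main() {
    const int n_layer = 24;             // encoder depth (assumed)
    const int raw[] = { -2, 3, 7, -2 }; // duplicates collapse in the set
    std::unordered_set<int> vision_feature_layer;
    for (int v : raw) {
        vision_feature_layer.insert(normalize_feature_layer(v, n_layer));
    }
    for (int v : vision_feature_layer) {
        std::printf("%d ", v);          // e.g. "23 7 3" (order unspecified)
    }
    std::printf("\n");
    return 0;
}
```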
@@ -1476,6 +1509,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->image_std[i] = std_data[i];
     }
 
+    // Calculate the deepest feature layer based on hparams and projector type
+    new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
+
     if (verbosity >= 2) {
         LOG_INF("\n%s: vision model hparams\n", __func__);
         LOG_INF("image_size         %d\n", hparams.image_size);
@@ -1489,8 +1525,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
         LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
         LOG_INF("v_image_grid_pinpoints: ");
-        for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+        for (const auto & pp : hparams.image_grid_pinpoints) {
+            LOG_INF("%d ", pp);
+        }
+        LOG_INF("\n");
+        LOG_INF("v_vision_feature_layer: ");
+        for (const auto & feature_layer : hparams.vision_feature_layer) {
+            LOG_INF("%d ", feature_layer);
         }
         LOG_INF("\n");
         LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -2235,10 +2276,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
         }
     } else {
-        if (params.image_grid_pinpoints[0] != 0) {
+        if (!params.image_grid_pinpoints.empty()) {
             // "spatial_unpad" with "anyres" processing for llava-1.6
             std::vector<std::pair<int, int>> possible_resolutions;
-            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+            for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) {
                 possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
             }
             std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
@@ -2404,7 +2445,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
 }
 
 const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    return ctx->vision_model.hparams.image_grid_pinpoints;
+    if (ctx->vision_model.hparams.image_grid_pinpoints.size()) {
+        return &ctx->vision_model.hparams.image_grid_pinpoints.front();
+    }
+    return nullptr;
+}
+
+size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
 int clip_n_patches(const struct clip_ctx * ctx) {
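Usage sketch for the changed accessor pair: callers that previously scanned `image_grid_pinpoints` for a 0 sentinel should now pair `clip_image_grid()` with `get_clip_image_grid_size()`. Assumes a loaded `clip_ctx` and linking against this file's header; `print_pinpoints` is an illustrative name:

```cpp
#include <cstdio>
#include <cstdint>
#include "clip.h"

// Pinpoints are stored as flat (width, height) pairs; grid may be nullptr
// when no pinpoints were loaded.
void print_pinpoints(const struct clip_ctx * ctx) {
    const int32_t * grid = clip_image_grid(ctx);
    const size_t n = get_clip_image_grid_size(ctx);
    for (size_t i = 0; grid != nullptr && i + 1 < n; i += 2) {
        std::printf("pinpoint: %d x %d\n", grid[i], grid[i + 1]);
    }
}
```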
@@ -2929,6 +2977,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
     return ctx->has_qwen2vl_merger;
 }
 
+// Determine the number of encoder layers to iterate over
+int get_deepest_feature_layer(const struct clip_ctx * ctx) {
+    // Get the index of the second to last layer; this is the
+    // default for models that have a llava projector
+    const auto & hparams = ctx->vision_model.hparams;
+    int n_layer = hparams.n_layer - 1;
+    int deepest_feature_layer = -1;
+
+    // Handle other projectors; incrementing here indicates that we
+    // should use the last encoder layer for the vision features.
+    if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
+        n_layer += 1;
+    }
+
+    // If we set explicit vision feature layers, only go up to the deepest one
+    for (const auto & feature_layer : hparams.vision_feature_layer) {
+        if (feature_layer > deepest_feature_layer) {
+            deepest_feature_layer = feature_layer;
+        }
+    }
+    return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
+}
+
 bool clip_encode_float_image(struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
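A worked example mirroring `get_deepest_feature_layer()`, with assumed values (`n_layer = 24`); the helper below restates the logic in isolation for illustration only:

```cpp
#include <cassert>
#include <unordered_set>

// Restated selection logic: fall back to the second-to-last layer (or last
// layer for minicpmv/glm/qwen2vl projectors) unless explicit feature layers
// were set, in which case the deepest explicit layer wins.
static int deepest(int n_layer, bool use_last_layer, const std::unordered_set<int> & layers) {
    int fallback = (n_layer - 1) + (use_last_layer ? 1 : 0);
    int deepest_layer = -1;
    for (int l : layers) {
        deepest_layer = l > deepest_layer ? l : deepest_layer;
    }
    return deepest_layer < 0 ? fallback : deepest_layer;
}

int main() {
    assert(deepest(24, false, {}) == 23);             // llava default: second-to-last
    assert(deepest(24, true,  {}) == 24);             // minicpmv/glm/qwen2vl: last layer
    assert(deepest(24, false, {3, 7, 15, 23}) == 23); // explicit layers: deepest entry
    return 0;
}
```

The graph builder then iterates `il` from 0 up to `ctx->max_feature_layer`, saving the running `embeddings` whenever `il` is in `vision_feature_layer`.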