@@ -587,6 +587,7 @@ struct clip_ctx {
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
 
+    int32_t max_feature_layer;
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
@@ -755,12 +756,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
-    // Check to see if we have 1+ set vision feature layers set; otherwise it's determined
-    // by the type of projector that this model has (usually last or second to last layer).
-    int max_feature_layer = get_deepest_feature_layer(ctx);
 
     // loop over layers
-    for (int il = 0; il < max_feature_layer; il++) {
+    for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
         // If this is an embedding feature layer, save the output.
@@ -862,7 +860,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // post-layernorm
-    if (ctx->has_post_norm && max_feature_layer == n_layer) {
+    if (ctx->has_post_norm && ctx->max_feature_layer == n_layer) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "post_ln");
 
@@ -1516,6 +1514,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->image_std[i] = std_data[i];
     }
 
+    // Calculate the deepest feature layer based on hparams and projector type
+    new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
+
     if (verbosity >= 2) {
         LOG_INF("\n%s: vision model hparams\n", __func__);
         LOG_INF("image_size %d\n", hparams.image_size);
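For reference, the value now cached on clip_ctx comes from get_deepest_feature_layer(), whose behaviour the removed comment in clip_image_build_graph describes: if the model declares one or more explicit vision feature layers, the encoder graph only needs to run up to the deepest of them; otherwise the projector type decides whether the last or the second-to-last encoder layer is used. The standalone sketch below restates that selection rule under those assumptions; the function and parameter names are illustrative and do not mirror the actual helper's signature in clip.cpp.

#include <vector>

// Sketch of the selection rule described above; not the actual clip.cpp helper.
static int deepest_feature_layer_sketch(int n_encoder_layers,
                                        bool projector_uses_last_layer,
                                        const std::vector<int> & vision_feature_layers) {
    // Fallback when no explicit feature layers are set: the last encoder layer
    // for projectors that consume it, otherwise the second-to-last layer.
    const int fallback = projector_uses_last_layer ? n_encoder_layers : n_encoder_layers - 1;

    // With explicit vision feature layers, only the deepest one matters,
    // since the encoder loop can stop once that layer has been produced.
    int deepest = -1;
    for (const int il : vision_feature_layers) {
        if (il > deepest) {
            deepest = il;
        }
    }
    return deepest < 0 ? fallback : deepest;
}

Caching this once at model load time avoids recomputing it on every graph build and keeps the loop bound and the post-layernorm check reading from the same value.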