@@ -770,8 +770,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770
770
771
771
// If this is an embedding feature layer, save the output.
772
772
// NOTE: 0 index here refers to the input to the encoder.
773
- for (int vf_layer_idx = 0 ; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
774
- if (il == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
773
+ for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] > 0 ); vl_idx ++) {
774
+ if (il == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
775
775
embedding_stack.push_back (embeddings);
776
776
break ;
777
777
}
@@ -875,8 +875,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875
875
}
876
876
877
877
// final layer is a vision feature layer
878
- for (int vf_layer_idx = 0 ; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx ++) {
879
- if (n_layer == ctx->vision_model .hparams .vision_feature_layer [vf_layer_idx ]) {
878
+ for (int vl_idx = 0 ; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [vl_idx] > 0 ); vl_idx ++) {
879
+ if (n_layer == ctx->vision_model .hparams .vision_feature_layer [vl_idx ]) {
880
880
embedding_stack.push_back (embeddings);
881
881
break ;
882
882
}
@@ -2991,7 +2991,8 @@ size_t get_max_image_grid_pinpoints() {
2991
2991
int get_deepest_feature_layer (const struct clip_ctx * ctx) {
2992
2992
// Get the index of the second to last layer; this is the
2993
2993
// default for models that have a llava projector
2994
- int n_layer = ctx->vision_model .hparams .n_layer - 1 ;
2994
+ const auto & hparams = ctx->vision_model .hparams ;
2995
+ int n_layer = hparams.n_layer - 1 ;
2995
2996
int deepest_feature_layer = -1 ;
2996
2997
2997
2998
// Handle other projectors; incrementing here indicates that we
@@ -3001,9 +3002,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
3001
3002
}
3002
3003
3003
3004
// If we set explicit vision feature layers, only go up to the deepest one
3004
- for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
3005
- if (ctx-> vision_model . hparams .vision_feature_layer [i] > deepest_feature_layer) {
3006
- deepest_feature_layer = ctx-> vision_model . hparams .vision_feature_layer [i];
3005
+ for (int i = 0 ; i < MAX_IMAGE_FEATURE_LAYERS && (hparams. vision_feature_layer [i] > 0 ) ; i++) {
3006
+ if (hparams.vision_feature_layer [i] > deepest_feature_layer) {
3007
+ deepest_feature_layer = hparams.vision_feature_layer [i];
3007
3008
}
3008
3009
}
3009
3010
return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;
0 commit comments