Skip to content

Commit bc18073

Browse files
Fix unset vision layer 0
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 47f47b2 commit bc18073

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

examples/llava/clip.cpp

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -770,8 +770,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
770770

771771
// If this is an embedding feature layer, save the output.
772772
// NOTE: 0 index here refers to the input to the encoder.
773-
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
774-
if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
773+
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
774+
if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
775775
embedding_stack.push_back(embeddings);
776776
break;
777777
}
@@ -875,8 +875,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
875875
}
876876

877877
// final layer is a vision feature layer
878-
for (int vf_layer_idx = 0; vf_layer_idx < MAX_IMAGE_FEATURE_LAYERS; vf_layer_idx++) {
879-
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
878+
for (int vl_idx = 0; vl_idx < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[vl_idx] > 0); vl_idx++) {
879+
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
880880
embedding_stack.push_back(embeddings);
881881
break;
882882
}
@@ -2991,7 +2991,8 @@ size_t get_max_image_grid_pinpoints() {
29912991
int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29922992
// Get the index of the second to last layer; this is the
29932993
// default for models that have a llava projector
2994-
int n_layer = ctx->vision_model.hparams.n_layer - 1;
2994+
const auto & hparams = ctx->vision_model.hparams;
2995+
int n_layer = hparams.n_layer - 1;
29952996
int deepest_feature_layer = -1;
29962997

29972998
// Handle other projectors; incrementing here indicates that we
@@ -3001,9 +3002,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
30013002
}
30023003

30033004
// If we set explicit vision feature layers, only go up to the deepest one
3004-
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS; i++) {
3005-
if (ctx->vision_model.hparams.vision_feature_layer[i] > deepest_feature_layer) {
3006-
deepest_feature_layer = ctx->vision_model.hparams.vision_feature_layer[i];
3005+
for (int i = 0; i < MAX_IMAGE_FEATURE_LAYERS && (hparams.vision_feature_layer[i] > 0); i++) {
3006+
if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
3007+
deepest_feature_layer = hparams.vision_feature_layer[i];
30073008
}
30083009
}
30093010
return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;

0 commit comments

Comments
 (0)