Skip to content

Commit 92952d6

Browse files
Calculate max feature layer at load time
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 11149b6 commit 92952d6

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

examples/llava/clip.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,7 @@ struct clip_ctx {
587587
struct clip_vision_model vision_model;
588588
projector_type proj_type = PROJECTOR_TYPE_MLP;
589589

590+
int32_t max_feature_layer;
590591
float image_mean[3];
591592
float image_std[3];
592593
bool use_gelu = false;
@@ -755,12 +756,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
755756
}
756757

757758
std::vector<struct ggml_tensor *> embedding_stack;
758-
// Check to see if we have 1+ set vision feature layers set; otherwise it's determined
759-
// by the type of projector that this model has (usually last or second to last layer).
760-
int max_feature_layer = get_deepest_feature_layer(ctx);
761759

762760
// loop over layers
763-
for (int il = 0; il < max_feature_layer; il++) {
761+
for (int il = 0; il < ctx->max_feature_layer; il++) {
764762
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
765763

766764
// If this is an embedding feature layer, save the output.
@@ -862,7 +860,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
862860
}
863861

864862
// post-layernorm
865-
if (ctx->has_post_norm && max_feature_layer == n_layer) {
863+
if (ctx->has_post_norm && ctx->max_feature_layer == n_layer) {
866864
embeddings = ggml_norm(ctx0, embeddings, eps);
867865
ggml_set_name(embeddings, "post_ln");
868866

@@ -1516,6 +1514,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15161514
new_clip->image_std[i] = std_data[i];
15171515
}
15181516

1517+
// Calculate the deepest feature layer based on hparams and projector type
1518+
new_clip->max_feature_layer = get_deepest_feature_layer(new_clip);
1519+
15191520
if (verbosity >= 2) {
15201521
LOG_INF("\n%s: vision model hparams\n", __func__);
15211522
LOG_INF("image_size %d\n", hparams.image_size);

0 commit comments

Comments
 (0)