We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 9a390c4 commit c104023 (Copy full SHA for c104023)
tools/mtmd/clip.cpp
@@ -879,9 +879,15 @@ struct clip_graph {
879
// add CLS token
880
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
881
882
+ // The larger models use a different ViT, which uses RMS norm instead of layer norm
883
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
884
+ norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
885
+ ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
886
+ : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
887
+
888
ggml_tensor * cur = build_vit(
889
inp, n_pos,
- NORM_TYPE_NORMAL,
890
+ norm_t,
891
hparams.ffn_op,
892
model.position_embeddings,
893
nullptr);
0 commit comments