
Commit ea24eb2

minicpmv : use clip_n_mmproj_embd instead of copying the same code everywhere

1 parent 6bac190 commit ea24eb2
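The whole change hinges on clip_n_mmproj_embd() reporting the multimodal projector's output dimension. As a rough sketch of the mapping this commit relies on for MiniCPM-V, inferred purely from the constants in the branches deleted below (the actual implementation in clip.cpp dispatches on the projector type):

static int clip_n_mmproj_embd_sketch(const clip_ctx * ctx) {
    // dimensions taken from the removed per-version branches
    switch (ctx->minicpmv_version) {
        case 2:  return 4096;
        case 3:  return 3584;
        case 4:  return 3584;
        default: GGML_ABORT("Unknown minicpmv version");
    }
}

With one helper as the single source of truth, the three call sites below collapse from per-version if/else chains into one line each.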

File tree: 1 file changed, +4 -29 lines changed

examples/llava/clip.cpp

Lines changed: 4 additions & 29 deletions
@@ -1114,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
-        }
-        else if (ctx->minicpmv_version == 3) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
-        else if (ctx->minicpmv_version == 4) {
-            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
-        }
+        int n_output_dim = clip_n_mmproj_embd(ctx);
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
         ggml_set_name(pos_embed, "pos_embed");
         ggml_set_input(pos_embed);
     }
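Reading the new allocation: ggml_new_tensor_3d takes the ggml context, the element type, and three extents. An annotated copy of the + lines (comments are mine, not part of the commit):

int n_output_dim = clip_n_mmproj_embd(ctx);     // 4096 (v2) or 3584 (v3/v4)
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32,
        n_output_dim,   // ne[0]: embedding dim per grid position
        pos_w * pos_h,  // ne[1]: one slot per patch-grid position
        1);             // ne[2]: single image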
@@ -1460,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     }

     { // attention
-        int hidden_size = 4096;
+        int hidden_size = clip_n_mmproj_embd(ctx);
         const int d_head = 128;
         int n_head = hidden_size/d_head;
         int num_query = 96;
         if (ctx->minicpmv_version == 2) {
-            hidden_size = 4096;
-            n_head = hidden_size/d_head;
             num_query = 96;
         }
         else if (ctx->minicpmv_version == 3) {
-            hidden_size = 3584;
-            n_head = hidden_size/d_head;
             num_query = 64;
         }
         else if (ctx->minicpmv_version == 4) {
-            hidden_size = 3584;
-            n_head = hidden_size/d_head;
             num_query = 64;
         }
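Only num_query still depends on the version here; hidden_size, and the head count derived from it, now follow from clip_n_mmproj_embd. A standalone arithmetic check that the derived values match what the deleted branches hard-coded:

#include <cassert>

int main() {
    const int d_head = 128;
    assert(4096 / d_head == 32); // minicpmv_version == 2
    assert(3584 / d_head == 28); // versions 3 and 4
    // num_query remains per-version: 96 (v2) vs 64 (v3/v4)
    return 0;
}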
@@ -3136,19 +3123,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
         // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
         struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
-        int embed_dim = 4096;
-        if (ctx->minicpmv_version == 2) {
-            embed_dim = 4096;
-        }
-        else if (ctx->minicpmv_version == 3) {
-            embed_dim = 3584;
-        }
-        else if (ctx->minicpmv_version == 4) {
-            embed_dim = 3584;
-        }
-        else {
-            GGML_ABORT("Unknown minicpmv version");
-        }
+        int embed_dim = clip_n_mmproj_embd(ctx);

         // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
         auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
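For context on the TODO: get_2d_sincos_pos_embed builds the fixed sin/cos table host-side, following the Qwen-VL recipe linked above. A hedged sketch of that computation (function name and exact ordering are illustrative; the layout shown, height sin/cos halves followed by width sin/cos halves, is the common recipe, not necessarily clip.cpp's exact one):

#include <cmath>
#include <utility>
#include <vector>

// Flattened [pos_h * pos_w, embed_dim] table; assumes embed_dim % 4 == 0.
static std::vector<float> sincos_pos_embed_2d(int embed_dim, std::pair<int, int> grid_hw) {
    const int quarter = embed_dim / 4;
    std::vector<float> out((size_t) grid_hw.first * grid_hw.second * embed_dim);
    size_t p = 0;
    for (int h = 0; h < grid_hw.first; ++h) {
        for (int w = 0; w < grid_hw.second; ++w, p += embed_dim) {
            for (int i = 0; i < quarter; ++i) {
                const float omega = 1.0f / std::pow(10000.0f, (float) i / quarter);
                out[p + i]               = std::sin(h * omega); // height axis, sin half
                out[p + quarter + i]     = std::cos(h * omega); // height axis, cos half
                out[p + 2 * quarter + i] = std::sin(w * omega); // width axis, sin half
                out[p + 3 * quarter + i] = std::cos(w * omega); // width axis, cos half
            }
        }
    }
    return out;
}

Doing this with ggml_sin/ggml_cos on the graph, as the TODO suggests, would move the table construction onto the compute backend instead of recomputing it on the host for each encode.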
