@@ -1114,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1114
1114
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
1115
1115
int pos_w = image_size_width/patch_size;
1116
1116
int pos_h = image_size_height/patch_size;
1117
- if (ctx->minicpmv_version == 2 ) {
1118
- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 4096 , pos_w * pos_h, 1 );
1119
- }
1120
- else if (ctx->minicpmv_version == 3 ) {
1121
- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 3584 , pos_w * pos_h, 1 );
1122
- }
1123
- else if (ctx->minicpmv_version == 4 ) {
1124
- pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 3584 , pos_w * pos_h, 1 );
1125
- }
1117
+ int n_output_dim = clip_n_mmproj_embd (ctx);
1118
+ pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1 );
1126
1119
ggml_set_name (pos_embed, " pos_embed" );
1127
1120
ggml_set_input (pos_embed);
1128
1121
}
@@ -1460,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
1460
1453
}
1461
1454
1462
1455
{ // attention
1463
- int hidden_size = 4096 ;
1456
+ int hidden_size = clip_n_mmproj_embd (ctx) ;
1464
1457
const int d_head = 128 ;
1465
1458
int n_head = hidden_size/d_head;
1466
1459
int num_query = 96 ;
1467
1460
if (ctx->minicpmv_version == 2 ) {
1468
- hidden_size = 4096 ;
1469
- n_head = hidden_size/d_head;
1470
1461
num_query = 96 ;
1471
1462
}
1472
1463
else if (ctx->minicpmv_version == 3 ) {
1473
- hidden_size = 3584 ;
1474
- n_head = hidden_size/d_head;
1475
1464
num_query = 64 ;
1476
1465
}
1477
1466
else if (ctx->minicpmv_version == 4 ) {
1478
- hidden_size = 3584 ;
1479
- n_head = hidden_size/d_head;
1480
1467
num_query = 64 ;
1481
1468
}
1482
1469
@@ -3136,19 +3123,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3136
3123
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
3137
3124
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
3138
3125
struct ggml_tensor * pos_embed = ggml_graph_get_tensor (gf, " pos_embed" );
3139
- int embed_dim = 4096 ;
3140
- if (ctx->minicpmv_version == 2 ) {
3141
- embed_dim = 4096 ;
3142
- }
3143
- else if (ctx->minicpmv_version == 3 ) {
3144
- embed_dim = 3584 ;
3145
- }
3146
- else if (ctx->minicpmv_version == 4 ) {
3147
- embed_dim = 3584 ;
3148
- }
3149
- else {
3150
- GGML_ABORT (" Unknown minicpmv version" );
3151
- }
3126
+ int embed_dim = clip_n_mmproj_embd (ctx);
3152
3127
3153
3128
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
3154
3129
auto pos_embed_t = get_2d_sincos_pos_embed (embed_dim, std::make_pair (pos_w, pos_h));
0 commit comments