@@ -3122,23 +3122,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3122
3122
// inspired by the resampler of Qwen-VL:
3123
3123
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
3124
3124
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
3125
- struct ggml_tensor * pos_embed = ggml_graph_get_tensor (gf, " pos_embed" );
3126
3125
int embed_dim = clip_n_mmproj_embd (ctx);
3127
3126
3128
3127
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
3129
3128
auto pos_embed_t = get_2d_sincos_pos_embed (embed_dim, std::make_pair (pos_w, pos_h));
3130
3129
3131
- std::vector<float > pos_data (ggml_nelements (pos_embed));
3132
- float * data = pos_data.data ();
3130
+ std::vector<float > pos_embed (embed_dim * pos_w * pos_h);
3133
3131
for (int i = 0 ; i < pos_w * pos_h; ++i){
3134
3132
for (int j = 0 ; j < embed_dim; ++j){
3135
- data [i * embed_dim + j] = pos_embed_t [i][j];
3133
+ pos_embed [i * embed_dim + j] = pos_embed_t [i][j];
3136
3134
}
3137
3135
}
3138
3136
3139
- ggml_backend_tensor_set ( pos_embed, data, 0 , ggml_nbytes ( pos_embed) );
3137
+ set_input_f32 ( " pos_embed" , pos_embed);
3140
3138
} break ;
3141
3139
case PROJECTOR_TYPE_QWEN2VL:
3140
+ {
3141
+ const int pw = image_size_width / patch_size;
3142
+ const int ph = image_size_height / patch_size;
3143
+ std::vector<int > positions (num_positions * 4 );
3144
+
3145
+ int ptr = 0 ;
3146
+ for (int y = 0 ; y < ph; y += 2 ) {
3147
+ for (int x = 0 ; x < pw; x += 2 ) {
3148
+ for (int dy = 0 ; dy < 2 ; dy++) {
3149
+ for (int dx = 0 ; dx < 2 ; dx++) {
3150
+ positions[ ptr] = y + dy;
3151
+ positions[ num_patches + ptr] = x + dx;
3152
+ positions[2 * num_patches + ptr] = y + dy;
3153
+ positions[3 * num_patches + ptr] = x + dx;
3154
+ ptr++;
3155
+ }
3156
+ }
3157
+ }
3158
+ }
3159
+
3160
+ set_input_i32 (" positions" , positions);
3161
+ } break ;
3142
3162
case PROJECTOR_TYPE_QWEN25VL:
3143
3163
{
3144
3164
// pw * ph = number of tokens output by the ViT after applying the patch merger
@@ -3154,10 +3174,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3154
3174
3155
3175
if (use_window_attn) {
3156
3176
const int attn_window_size = 112 ;
3157
- struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
3158
- struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
3159
- struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
3160
-
3161
3177
const int grid_window = attn_window_size / patch_size / merge_ratio;
3162
3178
int dst = 0 ;
3163
3179
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3175,8 +3191,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3175
3191
for (int dy = 0 ; dy < win_h; dy++) {
3176
3192
for (int dx = 0 ; dx < win_w; dx++) {
3177
3193
const int src = (y + dy) * pw + (x + dx);
3178
- assert (src < (int )idx.size ());
3179
- assert (dst < (int )inv_idx.size ());
3194
+ GGML_ASSERT (src < (int )idx.size ());
3195
+ GGML_ASSERT (dst < (int )inv_idx.size ());
3180
3196
idx [src] = dst;
3181
3197
inv_idx[dst] = src;
3182
3198
dst++;
@@ -3194,40 +3210,37 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3194
3210
}
3195
3211
}
3196
3212
3197
- ggml_backend_tensor_set ( window_idx, idx. data (), 0 , ggml_nbytes (window_idx) );
3198
- ggml_backend_tensor_set ( inv_window_idx, inv_idx. data (), 0 , ggml_nbytes (inv_window_idx) );
3199
- ggml_backend_tensor_set ( window_mask, mask. data (), 0 , ggml_nbytes (window_mask) );
3213
+ set_input_i32 ( " window_idx" , idx);
3214
+ set_input_i32 ( " inv_window_idx" , inv_idx);
3215
+ set_input_f32 ( " window_mask" , mask);
3200
3216
} else {
3201
- std::iota (idx.begin (), idx.end (), 0 );
3202
- // std::iota(inv_idx.begin(), inv_idx.end(), 0);
3217
+ for (int i = 0 ; i < ph * pw; i++) {
3218
+ idx[i] = i;
3219
+ }
3203
3220
}
3204
3221
3205
- struct ggml_tensor * positions = ggml_graph_get_tensor (gf, " positions" );
3206
3222
const int mpow = merge_ratio * merge_ratio;
3207
- std::vector<int > positions_data (ggml_nelements (positions));
3208
- int * data = positions_data.data ();
3223
+ std::vector<int > positions (num_positions * 4 );
3209
3224
3210
3225
int ptr = 0 ;
3211
- for (int y = 0 ; y < iph; y += merge_ratio)
3212
- {
3213
- for (int x = 0 ; x < ipw; x += merge_ratio)
3214
- {
3226
+ for (int y = 0 ; y < iph; y += merge_ratio) {
3227
+ for (int x = 0 ; x < ipw; x += merge_ratio) {
3215
3228
for (int dy = 0 ; dy < 2 ; dy++) {
3216
3229
for (int dx = 0 ; dx < 2 ; dx++) {
3217
3230
auto remap = idx[ptr / mpow];
3218
- remap = remap * mpow + (ptr % mpow);
3231
+ remap = ( remap * mpow) + (ptr % mpow);
3219
3232
3220
- data [ remap] = y + dy;
3221
- data [ num_patches + remap] = x + dx;
3222
- data [2 * num_patches + remap] = y + dy;
3223
- data [3 * num_patches + remap] = x + dx;
3233
+ positions [ remap] = y + dy;
3234
+ positions [ num_patches + remap] = x + dx;
3235
+ positions [2 * num_patches + remap] = y + dy;
3236
+ positions [3 * num_patches + remap] = x + dx;
3224
3237
ptr++;
3225
3238
}
3226
3239
}
3227
3240
}
3228
3241
}
3229
3242
3230
- ggml_backend_tensor_set ( positions, data, 0 , ggml_nbytes ( positions) );
3243
+ set_input_i32 ( " positions" , positions);
3231
3244
} break ;
3232
3245
case PROJECTOR_TYPE_PIXTRAL:
3233
3246
{
0 commit comments