Skip to content

Commit a264a25

Browse files
committed
split qwen2 and qwen2.5 code blocks
1 parent ea24eb2 commit a264a25

File tree

1 file changed

+42
-29
lines changed

1 file changed

+42
-29
lines changed

examples/llava/clip.cpp

Lines changed: 42 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -3122,23 +3122,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31223122
// inspired from resampler of Qwen-VL:
31233123
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
31243124
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
3125-
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
31263125
int embed_dim = clip_n_mmproj_embd(ctx);
31273126

31283127
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
31293128
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
31303129

3131-
std::vector<float> pos_data(ggml_nelements(pos_embed));
3132-
float * data = pos_data.data();
3130+
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
31333131
for(int i = 0; i < pos_w * pos_h; ++i){
31343132
for(int j = 0; j < embed_dim; ++j){
3135-
data[i * embed_dim + j] = pos_embed_t[i][j];
3133+
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
31363134
}
31373135
}
31383136

3139-
ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed));
3137+
set_input_f32("pos_embed", pos_embed);
31403138
} break;
31413139
case PROJECTOR_TYPE_QWEN2VL:
3140+
{
3141+
const int pw = image_size_width / patch_size;
3142+
const int ph = image_size_height / patch_size;
3143+
std::vector<int> positions(num_positions * 4);
3144+
3145+
int ptr = 0;
3146+
for (int y = 0; y < ph; y += 2) {
3147+
for (int x = 0; x < pw; x += 2) {
3148+
for (int dy = 0; dy < 2; dy++) {
3149+
for (int dx = 0; dx < 2; dx++) {
3150+
positions[ ptr] = y + dy;
3151+
positions[ num_patches + ptr] = x + dx;
3152+
positions[2 * num_patches + ptr] = y + dy;
3153+
positions[3 * num_patches + ptr] = x + dx;
3154+
ptr++;
3155+
}
3156+
}
3157+
}
3158+
}
3159+
3160+
set_input_i32("positions", positions);
3161+
} break;
31423162
case PROJECTOR_TYPE_QWEN25VL:
31433163
{
31443164
// pw * ph = number of tokens output by ViT after apply patch merger
@@ -3154,10 +3174,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31543174

31553175
if (use_window_attn) {
31563176
const int attn_window_size = 112;
3157-
struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
3158-
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
3159-
struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
3160-
31613177
const int grid_window = attn_window_size / patch_size / merge_ratio;
31623178
int dst = 0;
31633179
// [num_vision_tokens, num_vision_tokens] attention mask tensor
@@ -3175,8 +3191,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31753191
for (int dy = 0; dy < win_h; dy++) {
31763192
for (int dx = 0; dx < win_w; dx++) {
31773193
const int src = (y + dy) * pw + (x + dx);
3178-
assert(src < (int)idx.size());
3179-
assert(dst < (int)inv_idx.size());
3194+
GGML_ASSERT(src < (int)idx.size());
3195+
GGML_ASSERT(dst < (int)inv_idx.size());
31803196
idx [src] = dst;
31813197
inv_idx[dst] = src;
31823198
dst++;
@@ -3194,40 +3210,37 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
31943210
}
31953211
}
31963212

3197-
ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
3198-
ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
3199-
ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
3213+
set_input_i32("window_idx", idx);
3214+
set_input_i32("inv_window_idx", inv_idx);
3215+
set_input_f32("window_mask", mask);
32003216
} else {
3201-
std::iota(idx.begin(), idx.end(), 0);
3202-
// std::iota(inv_idx.begin(), inv_idx.end(), 0);
3217+
for (int i = 0; i < ph * pw; i++) {
3218+
idx[i] = i;
3219+
}
32033220
}
32043221

3205-
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
32063222
const int mpow = merge_ratio * merge_ratio;
3207-
std::vector<int> positions_data(ggml_nelements(positions));
3208-
int * data = positions_data.data();
3223+
std::vector<int> positions(num_positions * 4);
32093224

32103225
int ptr = 0;
3211-
for (int y = 0; y < iph; y += merge_ratio)
3212-
{
3213-
for (int x = 0; x < ipw; x += merge_ratio)
3214-
{
3226+
for (int y = 0; y < iph; y += merge_ratio) {
3227+
for (int x = 0; x < ipw; x += merge_ratio) {
32153228
for (int dy = 0; dy < 2; dy++) {
32163229
for (int dx = 0; dx < 2; dx++) {
32173230
auto remap = idx[ptr / mpow];
3218-
remap = remap * mpow + (ptr % mpow);
3231+
remap = (remap * mpow) + (ptr % mpow);
32193232

3220-
data[ remap] = y + dy;
3221-
data[ num_patches + remap] = x + dx;
3222-
data[2 * num_patches + remap] = y + dy;
3223-
data[3 * num_patches + remap] = x + dx;
3233+
positions[ remap] = y + dy;
3234+
positions[ num_patches + remap] = x + dx;
3235+
positions[2 * num_patches + remap] = y + dy;
3236+
positions[3 * num_patches + remap] = x + dx;
32243237
ptr++;
32253238
}
32263239
}
32273240
}
32283241
}
32293242

3230-
ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
3243+
set_input_i32("positions", positions);
32313244
} break;
32323245
case PROJECTOR_TYPE_PIXTRAL:
32333246
{

0 commit comments

Comments
 (0)