|
26 | 26 | #include <sstream>
|
27 | 27 | #include <cinttypes>
|
28 | 28 | #include <limits>
|
| 29 | +#include <numeric> |
29 | 30 |
|
30 | 31 | #if defined(LLAVA_LOG_OFF)
|
31 | 32 | # define LOG_INF(...)
|
@@ -984,13 +985,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
|
984 | 985 | embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx);
|
985 | 986 | embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size);
|
986 | 987 |
|
987 |
| - positions = ggml_reshape_2d(ctx0, positions, num_position_ids / 4, 4); |
988 |
| - positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3)); |
989 |
| - positions = ggml_reshape_2d(ctx0, positions, 16, num_position_ids / 16); |
990 |
| - positions = ggml_get_rows(ctx0, positions, inv_window_idx); |
991 |
| - positions = ggml_reshape_2d(ctx0, positions, 4, num_position_ids / 4); |
992 |
| - positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3)); |
993 |
| - positions = ggml_reshape_1d(ctx0, positions, num_position_ids); |
| 988 | + // positions = ggml_reshape_2d(ctx0, positions, num_position_ids / 4, 4); |
| 989 | + // positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3)); |
| 990 | + // positions = ggml_reshape_2d(ctx0, positions, 16, num_position_ids / 16); |
| 991 | + // positions = ggml_get_rows(ctx0, positions, inv_window_idx); |
| 992 | + // positions = ggml_reshape_2d(ctx0, positions, 4, num_position_ids / 4); |
| 993 | + // positions = ggml_cont(ctx0, ggml_permute(ctx0, positions, 1, 0, 2, 3)); |
| 994 | + // positions = ggml_reshape_1d(ctx0, positions, num_position_ids); |
994 | 995 |
|
995 | 996 | // ggml_build_forward_expand(gf, embeddings);
|
996 | 997 | // ggml_free(ctx0);
|
@@ -3095,33 +3096,97 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
3095 | 3096 | }
|
3096 | 3097 |
|
3097 | 3098 | if (ctx->has_qwen2vl_merger) {
|
| 3099 | + /* |
| 3100 | + pw * ph = number of tokens output by the ViT after applying the patch merger |
| 3101 | + ipw * iph = number of vision tokens processed inside the ViT |
| 3102 | + */ |
| 3103 | + const int merge_ratio = 2; |
| 3104 | + const int pw = image_size_width / patch_size / merge_ratio; |
| 3105 | + const int ph = image_size_height / patch_size / merge_ratio; |
| 3106 | + const int ipw = image_size_width / patch_size; |
| 3107 | + const int iph = image_size_height / patch_size; |
| 3108 | + |
| 3109 | + std::vector<int> idx(ph * pw); |
| 3110 | + std::vector<int> inv_idx(ph * pw); |
| 3111 | + |
| 3112 | + if (hparams.attn_window_size > 0) { |
| 3113 | + struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); |
| 3114 | + struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); |
| 3115 | + struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); |
| 3116 | + |
| 3117 | + const int grid_window = hparams.attn_window_size / patch_size / merge_ratio; |
| 3118 | + int dst = 0; |
| 3119 | + // [num_vision_tokens, num_vision_tokens] attention mask tensor |
| 3120 | + std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest()); |
| 3121 | + int mask_row = 0; |
| 3122 | + |
| 3123 | + for (int y = 0; y < ph; y+=grid_window) |
| 3124 | + { |
| 3125 | + for (int x = 0; x < pw; x+=grid_window) |
| 3126 | + { |
| 3127 | + const int win_h = std::min(grid_window, ph - y); |
| 3128 | + const int win_w = std::min(grid_window, pw - x); |
| 3129 | + const int dst_0 = dst; |
| 3130 | + // group all tokens belonging to the same window together (into a contiguous range) |
| 3131 | + for (int dy = 0; dy < win_h; dy++) { |
| 3132 | + for (int dx = 0; dx < win_w; dx++) { |
| 3133 | + const int src = (y + dy) * pw + (x + dx); |
| 3134 | + assert(src < (int)idx.size()); |
| 3135 | + assert(dst < (int)inv_idx.size()); |
| 3136 | + idx[src] = dst; |
| 3137 | + inv_idx[dst] = src; |
| 3138 | + dst++; |
| 3139 | + } |
| 3140 | + } |
| 3141 | + |
| 3142 | + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { |
| 3143 | + int row_offset = mask_row * (ipw * iph); |
| 3144 | + std::fill( |
| 3145 | + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), |
| 3146 | + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), |
| 3147 | + 0.0); |
| 3148 | + mask_row++; |
| 3149 | + } |
| 3150 | + } |
| 3151 | + } |
| 3152 | + |
| 3153 | + if (window_idx) ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); |
| 3154 | + if (inv_window_idx) ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); |
| 3155 | + if (window_mask) ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); |
| 3156 | + } else { |
| 3157 | + std::iota(idx.begin(), idx.end(), 0); |
| 3158 | + std::iota(inv_idx.begin(), inv_idx.end(), 0); |
| 3159 | + } |
| 3160 | + |
3098 | 3161 | struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
3099 |
| - if (positions) { |
3100 | 3162 |
|
3101 |
| - const int pw = image_size_width / patch_size; |
3102 |
| - const int ph = image_size_height / patch_size; |
| 3163 | + // const int pw = image_size_width / patch_size; |
| 3164 | + // const int ph = image_size_height / patch_size; |
| 3165 | + const int mpow = (merge_ratio * merge_ratio); |
3103 | 3166 | int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
3104 | 3167 |
|
3105 | 3168 | int ptr = 0;
|
3106 |
| - for (int y = 0; y < ph; y+=2) |
| 3169 | + for (int y = 0; y < iph; y+=merge_ratio) |
3107 | 3170 | {
|
3108 |
| - for (int x = 0; x < pw; x+=2) |
| 3171 | + for (int x = 0; x < ipw; x+=merge_ratio) |
3109 | 3172 | {
|
3110 | 3173 | for (int dy = 0; dy < 2; dy++) {
|
3111 | 3174 | for (int dx = 0; dx < 2; dx++) {
|
3112 |
| - positions_data[ptr] = y + dy; |
3113 |
| - positions_data[num_patches + ptr] = x + dx; |
3114 |
| - positions_data[num_patches * 2 + ptr] = y + dy; |
3115 |
| - positions_data[num_patches * 3 + ptr] = x + dx; |
| 3175 | + auto remap = idx[ptr / mpow]; |
| 3176 | + remap = remap * mpow + (ptr % mpow); |
| 3177 | + |
| 3178 | + positions_data[remap] = y + dy; |
| 3179 | + positions_data[num_patches + remap] = x + dx; |
| 3180 | + positions_data[num_patches * 2 + remap] = y + dy; |
| 3181 | + positions_data[num_patches * 3 + remap] = x + dx; |
3116 | 3182 | ptr++;
|
3117 | 3183 | }
|
3118 | 3184 | }
|
3119 | 3185 | }
|
3120 | 3186 | }
|
3121 | 3187 |
|
3122 |
| - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); |
| 3188 | + if (positions) ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); |
3123 | 3189 | free(positions_data);
|
3124 |
| - } |
3125 | 3190 | }
|
3126 | 3191 | else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
|
3127 | 3192 | // do nothing
|
|
0 commit comments