@@ -1718,7 +1718,8 @@ struct clip_model_loader {
1718
1718
1719
1719
if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
1720
1720
|| ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
1721
- || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) {
1721
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
1722
+ || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
1722
1723
n_layer += 1 ;
1723
1724
}
1724
1725
@@ -2744,7 +2745,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
2744
2745
}
2745
2746
return true ;
2746
2747
}
2747
- else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
2748
+ else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
2748
2749
clip_image_u8 resized;
2749
2750
auto patch_size = clip_get_patch_size (ctx) * 2 ;
2750
2751
int nx = ceil ((float )img->nx / patch_size) * patch_size;
@@ -3139,7 +3140,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3139
3140
else {
3140
3141
// non-minicpmv models
3141
3142
3142
- if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
3143
+ if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL ) {
3143
3144
// pw * ph = number of tokens output by ViT after apply patch merger
3144
3145
// ipw * ipw = number of vision token been processed inside ViT
3145
3146
const int merge_ratio = 2 ;
@@ -3279,7 +3280,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
3279
3280
}
3280
3281
}
3281
3282
3282
- if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
3283
+ if (use_window_attn && ( ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx-> proj_type == PROJECTOR_TYPE_QWEN25VL) ) {
3283
3284
struct ggml_tensor * window_idx = ggml_graph_get_tensor (gf, " window_idx" );
3284
3285
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor (gf, " inv_window_idx" );
3285
3286
struct ggml_tensor * window_mask = ggml_graph_get_tensor (gf, " window_mask" );
0 commit comments