@@ -1,12 +1,11 @@
 import argparse
-import glob
-import os
-from typing import Any, Dict
+from typing import Dict
 
 import torch
+import numpy as np
 from gguf import *
 from transformers import (
-    Qwen2VLForConditionalGeneration,
+    Qwen2VLForConditionalGeneration,
     Qwen2VLProcessor,
     AutoProcessor,
     Qwen2VLConfig
@@ -44,7 +43,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
             else:  # bias
                 c3 = ten.shape[0]
             assert c3 % 3 == 0
-            c = c3 // 3
+            c = c3 // 3
             wq = ten[:c]
             wk = ten[c: c * 2]
             wv = ten[c * 2:]
@@ -68,7 +67,7 @@ def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
             tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
         else:
             tensor_map[to_gguf_name(f"vision_model.{name}")] = ten
-
+
     for new_name, ten in tensor_map.items():
         if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
             tensor_map[new_name] = ten.astype(np.float32)
@@ -89,16 +88,14 @@ def main(args):
         ftype = 1
     else:
         raise ValueError()
-
+
     model_name = args.model_name
     print("model_name: ", model_name)
     qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
         model_name, torch_dtype=dtype, device_map="cpu"
    )
-    cfg: Qwen2VLConfig = qwen2vl.config
+    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
     vcfg = cfg.vision_config
-    rope_cfg = cfg.rope_scaling
-
 
     fname_out = "qwen2vl-vision.gguf"
     fout = GGUFWriter(path=fname_out, arch="clip")
@@ -125,23 +122,22 @@ def main(args):
         fout.add_tensor(name, data)
 
     fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
-    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
+    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divable by (14*2)
     fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
     fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
     fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
     fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # BUG: not sure what this does
+    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
     fout.add_name(model_name)
     """
-    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
+    HACK: Since vision rope related parameter aren't stored in the `Qwen2VLConfig,
          it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`.
     """
 
     processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
-    # breakpoint()
-    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)
-    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)
+    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)  # type: ignore[reportAttributeAccessIssue]
+    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)  # type: ignore[reportAttributeAccessIssue]
 
     fout.write_header_to_file()
     fout.write_kv_data_to_file()
@@ -154,4 +150,4 @@ def main(args):
     parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
     parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
     args = parser.parse_args()
-    main(args)
+    main(args)
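
For reference, a typical invocation of the script touched by this diff would follow its argparse definition above: a positional model name (defaulting to Qwen/Qwen2-VL-2B-Instruct) and an optional --data_type of fp32 or fp16. The file name qwen2_vl_surgery.py is an assumption here, since the diff does not show the script's path:

    python qwen2_vl_surgery.py Qwen/Qwen2-VL-2B-Instruct --data_type fp32

This writes the vision-tower tensors and CLIP metadata to qwen2vl-vision.gguf in the current directory.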