
Commit 90eefc2

refactor minicpm-v support
1 parent 0959cc1 commit 90eefc2

5 files changed (+186 lines, −136 lines)


convert_hf_to_gguf.py (47 additions, 31 deletions)

```diff
@@ -1008,6 +1008,29 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
         self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0])
 
 
+# TODO: maybe merge this with Model in the future
+class VisionModelHelper:
+    model: Model
+    tok_embd_tensor: Tensor | None = None
+
+    def __init__(self, model: Model):
+        self.model = model
+        # TODO: how to do this without reading the whole safetensor file?
+        for tname, tensor in model.get_tensors():
+            if tname.endswith("embed_tokens.weight"):
+                self.tok_embd_tensor = tensor
+
+    def get_embd_for_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, gguf.MODEL_TENSOR]], tensor_name_postfix = '.weight') -> Iterable[tuple[str, Tensor]]:
+        if self.tok_embd_tensor is None:
+            raise ValueError("Token embedding tensor not found")
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.model.dir_model, trust_remote_code=True)
+        for token, tensor_name in map_token_to_tensor_name:
+            tok_id = tokenizer.get_vocab()[token]
+            row = self.tok_embd_tensor[tok_id]
+            yield gguf.TENSOR_NAMES[tensor_name] + tensor_name_postfix, row
+
+
 @Model.register("GPTNeoXForCausalLM")
 class GPTNeoXModel(Model):
     model_arch = gguf.MODEL_ARCH.GPTNEOX
```
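The helper's core move is a vocabulary lookup followed by a row slice of the LLM's token-embedding matrix. Here is a minimal standalone sketch of that idea; the function and variable names (`embd_rows_for_tokens`, `model_dir`, `tok_embd`) are illustrative and not part of the commit:

```python
# Sketch only: mirrors what get_embd_for_tokens does, minus the gguf naming.
# For each special token, look up its id in the HF tokenizer vocab and take
# the matching row of the token-embedding matrix (shape: [n_vocab, n_embd]).
from collections.abc import Iterable

import torch
from transformers import AutoTokenizer

def embd_rows_for_tokens(model_dir: str, tok_embd: torch.Tensor,
                         tokens: list[str]) -> Iterable[tuple[str, torch.Tensor]]:
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    vocab = tokenizer.get_vocab()  # maps token string -> token id
    for token in tokens:
        yield token, tok_embd[vocab[token]]  # one (n_embd,) row per token
```

In the commit, each such row is emitted under a gguf tensor name (`gguf.TENSOR_NAMES[tensor_name] + '.weight'`), and the matching `LLM_TENSOR_INFOS` entries added in src/llama-arch.cpp below pair those tensors with `GGML_OP_CONCAT`.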
```diff
@@ -2355,11 +2378,11 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
 
 @Model.register("MiniCPMV")
 class MiniCPMVModel(Qwen2Model):
-    # based on minicpmv-surgery.py, not sure why it is Qwen2Model instead of MiniCPMModel
+    # MiniCPM-V 2.5 is Qwen2 and 2.6 is Qwen-2.5
     model_arch = gguf.MODEL_ARCH.QWEN2
     proj_type: gguf.constants.CLIPProjectorType | None
     resampler_n_embd = 0
-    tok_embd_tensor: Tensor | None = None
+    vhelper: VisionModelHelper | None
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -2378,56 +2401,49 @@ def __init__(self, *args, **kwargs):
                 self.proj_type = gguf.constants.CLIPProjectorType.MINICPMV_2_6
             else:
                 raise ValueError(f"Unsupported MiniCPM-V version: {version}")
+            self.vhelper = VisionModelHelper(self)
             # TODO: how to do this without reading the whole safetensor file?
             for tname, tensor in self.get_tensors():
                 if tname == "resampler.ln_post.bias":
                     self.resampler_n_embd = tensor.shape[0]
-                if tname.endswith("embed_tokens.weight"):
-                    self.tok_embd_tensor = tensor
             if self.resampler_n_embd < 2:
                 raise ValueError("Failed to detect resampler embedding size")
         else:
             raise ValueError("Expected vision_config, but not found")
 
-        if self.vparams is not None and self.vision_arch is not None and self.preprocessor_config is not None:
-            self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5]
-            self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5]
-            self.hparams["vision_feature_layer"] = 0
-            self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
-
-    def get_embd_of_tokens(self, map_token_to_tensor_name: Iterable[tuple[str, str]]) -> Iterable[tuple[str, Tensor]]:
-        if self.tok_embd_tensor is None:
-            raise ValueError("Token embedding tensor not found")
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        for token, tensor_name in map_token_to_tensor_name:
-            tok_id = tokenizer.get_vocab()[token]
-            row = self.tok_embd_tensor[tok_id]
-            yield tensor_name, row
+        assert self.vparams is not None
+        assert self.vision_arch is not None
+        assert self.preprocessor_config is not None
+        self.preprocessor_config["image_mean"] = [0.5, 0.5, 0.5]
+        self.preprocessor_config["image_std"] = [0.5, 0.5, 0.5]
+        self.hparams["vision_feature_layer"] = 0
+        self.v_tensor_map = gguf.get_tensor_name_map(self.vision_arch, self.vparams["num_hidden_layers"])
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        # For vision model
-        if self.vparams is not None and self.proj_type is not None:
-            self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
-            self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
-            self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
-            max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
-            self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
+        assert self.vparams is not None and self.proj_type is not None
+        self.gguf_writer.add_vision_vit_patch_merge_type(gguf.CLIPPatchMergeType.FLAT)
+        self.gguf_writer.add_vision_vit_projector_type(self.proj_type)
+        self.gguf_writer.add_vision_vit_layer_norm_epsilon(1e-06)
+        max_pos_embd = (self.vparams["image_size"] // self.vparams["patch_size"])**2
+        self.gguf_writer.add_vision_vit_max_position_embeddings(max_pos_embd)
 
 
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        # because the model operates exclusively on 70x70 patches for now, we should precompute the positional embeddings to gain performance
+        # in the future, we can do it in cpp if we figure out how to do it efficiently
         yield (
             self.format_tensor_name(gguf.MODEL_TENSOR.V_RESMPL_POS_EMBD_K, is_vision=True),
             torch.from_numpy(self._get_2d_sincos_pos_embed(self.resampler_n_embd, (70, 70)))
         )
+        assert self.vhelper is not None
         added_tokens = [
-            ("<image>",  gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE    ] + ".weight"),
-            ("</image>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE] + ".weight"),
-            ("<slice>",  gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE    ] + ".weight"),
-            ("</slice>", gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE] + ".weight"),
+            ("<image>",  gguf.MODEL_TENSOR.V_TOK_EMBD_IMAGE),
+            ("</image>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_IMAGE),
+            ("<slice>",  gguf.MODEL_TENSOR.V_TOK_EMBD_SLICE),
+            ("</slice>", gguf.MODEL_TENSOR.V_TOK_EMBD_END_SLICE),
         ]
-        for tensor_name, tensor in self.get_embd_of_tokens(added_tokens):
+        for tensor_name, tensor in self.vhelper.get_embd_for_tokens(added_tokens):
             yield tensor_name, tensor
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
```
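The positional table that `generate_extra_tensors` precomputes is a 2D sine-cosine embedding over the fixed 70x70 grid. Below is a sketch of the standard MAE-style formulation, on the assumption that `_get_2d_sincos_pos_embed` follows it; the script's exact variant may differ, and the name `sincos_pos_embed_2d` is illustrative:

```python
import numpy as np

def sincos_pos_embed_2d(embed_dim: int, grid_hw: tuple[int, int]) -> np.ndarray:
    """Return a (H*W, embed_dim) table; embed_dim must be divisible by 4."""
    h, w = grid_hw
    grid_y, grid_x = np.meshgrid(np.arange(h, dtype=np.float32),
                                 np.arange(w, dtype=np.float32), indexing="ij")

    def embed_1d(dim: int, pos: np.ndarray) -> np.ndarray:
        # classic transformer frequencies: omega_k = 1 / 10000^(k / (dim/2))
        omega = 1.0 / 10000.0 ** (np.arange(dim // 2, dtype=np.float32) / (dim // 2))
        out = np.outer(pos.reshape(-1), omega)                     # (H*W, dim/2)
        return np.concatenate([np.sin(out), np.cos(out)], axis=1)  # (H*W, dim)

    # first half of the channels encodes y, second half encodes x
    return np.concatenate([embed_1d(embed_dim // 2, grid_y),
                           embed_1d(embed_dim // 2, grid_x)], axis=1)

# e.g. sincos_pos_embed_2d(resampler_n_embd, (70, 70)) -> shape (4900, resampler_n_embd)
```

Precomputing this at conversion time trades a little GGUF size for skipping the trig work at inference, which is what the in-diff comment is pointing at.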

src/llama-arch.cpp (19 additions, 4 deletions)

```diff
@@ -1559,9 +1559,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CONVNEXT_PW2,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CONVNEXT_GAMMA,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     // vision
-    {LLM_TENSOR_V_MMPROJ,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_V_MMPROJ_MLP,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_V_MMPROJ_PEG,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_MMPROJ,             {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_MMPROJ_MLP,         {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_MMPROJ_PEG,         {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_V_ENC_EMBD_CLS,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
     {LLM_TENSOR_V_ENC_EMBD_PATCH,     {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
     {LLM_TENSOR_V_ENC_EMBD_POS,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_ADD}},
@@ -1575,7 +1575,22 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_V_ENC_FFN_DOWN,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_V_PRE_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_V_POST_NORM,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    // TODO: add minicpmv resampler tensors
+    {LLM_TENSOR_V_RESMPL_POS_EMBD_K,  {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_ADD}},
+    {LLM_TENSOR_V_RESMPL_ATTN_Q,      {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_ATTN_K,      {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_ATTN_V,      {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_ATTN_OUT,    {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_KV,          {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_KV_NORM,     {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
+    {LLM_TENSOR_V_RESMPL_POST_NORM,   {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
+    {LLM_TENSOR_V_RESMPL_Q_NORM,      {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL}},
+    {LLM_TENSOR_V_RESMPL_PROJ,        {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_V_RESMPL_QUERY,       {LLM_TENSOR_LAYER_PROJECTION, GGML_OP_MUL_MAT}},
+    // special token embeddings for image
+    {LLM_TENSOR_V_TOK_EMBD_IMAGE,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
+    {LLM_TENSOR_V_TOK_EMBD_END_IMAGE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
+    {LLM_TENSOR_V_TOK_EMBD_SLICE,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
+    {LLM_TENSOR_V_TOK_EMBD_END_SLICE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_CONCAT}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
```

src/llama-arch.h (1 addition, 0 deletions)

```diff
@@ -393,6 +393,7 @@ enum llm_tensor {
 enum llm_tensor_layer {
     LLM_TENSOR_LAYER_INPUT,
     LLM_TENSOR_LAYER_REPEATING,
+    LLM_TENSOR_LAYER_PROJECTION,
     LLM_TENSOR_LAYER_OUTPUT,
 };
```
