
Commit 272935b

llava : add MobileVLM_V2 backup (#6175)
* Add MobileVLM_V2 backup
* Update MobileVLM-README.md
* Update examples/llava/MobileVLM-README.md
  Co-authored-by: Georgi Gerganov <[email protected]>
* Update examples/llava/convert-image-encoder-to-gguf.py
  Co-authored-by: Georgi Gerganov <[email protected]>
* clip : fix whitespace
* fix definition mistake in clip.cpp

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent ccf58aa commit 272935b

File tree

- examples/llava/MobileVLM-README.md
- examples/llava/clip.cpp
- examples/llava/convert-image-encoder-to-gguf.py

3 files changed: +67 -6 lines changed


examples/llava/MobileVLM-README.md

Lines changed: 12 additions & 2 deletions
@@ -1,11 +1,13 @@
 # MobileVLM
 
-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
 
 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
 
 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
 
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** is the same, but the model conversion process differs slightly. Therefore, using MobileVLM as an example, the differing conversion step is shown below.
+
 ## Usage
 Build with cmake or run `make llava-cli` to build it.
 
@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```
 
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldp
 ```
 
+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
+```
+
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh

examples/llava/clip.cpp

Lines changed: 49 additions & 1 deletion
@@ -119,19 +119,22 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ      "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP,   "mlp" },
     { PROJECTOR_TYPE_LDP,   "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2" },
 };
 
 
@@ -475,6 +478,14 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_block_2_block_2_0_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
 };
 
 struct clip_ctx {
@@ -807,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        int n_patch = 24;
+        struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+        mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+        mlp_0 = ggml_gelu(ctx0, mlp_0);
+        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+        // mlp_2 ne = [2048, 576, 1, 1]
+        // AVG Pool Layer 2*2, strides = 2
+        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        // mlp_2 ne = [576, 2048, 1, 1]
+        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        // mlp_2 ne = [24, 24, 2048, 1]
+        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+        // weight ne = [3, 3, 2048, 1]
+        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+        embeddings = peg_0;
+    }
     else {
         GGML_ASSERT(false);
     }
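For readers less used to ggml graph code, the LDPv2 branch above runs the CLIP patch embeddings through a two-layer MLP with GELU, lays the 576 tokens back out on a 24x24 grid, average-pools it 2x2 down to 12x12, and adds a positional-encoding-generator (PEG) term from a 3x3 depthwise convolution with a residual connection, yielding 144 tokens of width 2048 (the 2048/576 shapes come from the comments in the hunk). Below is a minimal PyTorch re-statement of that computation, not the actual implementation; the function name, the CLIP width of 1024, and the random stand-in weights are assumptions for illustration.

```python
# Hedged sketch: PyTorch equivalent of the LDPv2 projector graph above.
# Shapes follow the comments in clip.cpp (576 patches, 2048-dim projection);
# clip_dim=1024 and the random parameter tensors are illustrative assumptions.
import torch
import torch.nn.functional as F

def ldpv2_projector(embeddings: torch.Tensor,
                    mlp_0_w, mlp_0_b, mlp_2_w, mlp_2_b,
                    peg_0_w, peg_0_b,
                    n_patch: int = 24) -> torch.Tensor:
    # embeddings: [n_patch * n_patch, clip_dim] = [576, 1024]
    x = F.gelu(embeddings @ mlp_0_w.T + mlp_0_b)   # [576, 2048]
    x = x @ mlp_2_w.T + mlp_2_b                    # [576, 2048]
    # put tokens back on the 24x24 patch grid: [1, 2048, 24, 24]
    x = x.T.reshape(1, -1, n_patch, n_patch)
    x = F.avg_pool2d(x, kernel_size=2, stride=2)   # [1, 2048, 12, 12]
    # PEG: 3x3 depthwise conv + residual, then per-channel bias
    peg = F.conv2d(x, peg_0_w, stride=1, padding=1, groups=x.shape[1])
    x = peg + x + peg_0_b.view(1, -1, 1, 1)
    # flatten back to a token sequence: [1, 144, 2048]
    return x.flatten(2).transpose(1, 2)

# Example with random stand-in weights:
emb = torch.randn(576, 1024)
out = ldpv2_projector(
    emb,
    torch.randn(2048, 1024), torch.randn(2048),
    torch.randn(2048, 2048), torch.randn(2048),
    torch.randn(2048, 1, 3, 3), torch.randn(2048),
)
print(out.shape)  # torch.Size([1, 144, 2048])
```

The 12x12 = 144 output tokens explain why the V2 projector hands the LLM a quarter as many image tokens as the 576-token MLP path, which is the main efficiency point of the pooling step.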
@@ -1177,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
         vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
         vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-    } else {
+    }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        // MobileVLM_V2 projection
+        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+        vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+        vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+        vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+        vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+    }
+    else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
     }
@@ -1966,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }

examples/llava/convert-image-encoder-to-gguf.py

Lines changed: 6 additions & 3 deletions
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re
 
 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
-
     if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name
 
     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
 
@@ -83,7 +86,7 @@ def bytes_to_unicode():
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                 help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
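To make the effect of the renaming rules added to `get_tensor_name` concrete, here is a minimal sketch of just the projector branch run on a few sample names. The sample input names are assumptions chosen to exercise the two regexes, not names read from a real MobileVLM_V2 checkpoint; the point is that the outputs match the `TN_MVLM_PROJ_MLP` ("mm.model.mlp.%d.%s") and `TN_MVLM_PROJ_PEG` ("mm.model.peg.%d.%s") patterns that clip.cpp now looks up.

```python
# Sketch of the projector branch of get_tensor_name(); the sample tensor
# names below are illustrative assumptions, not values from a real model.
import re

def rename_projector_tensor(name: str) -> str:
    name = name.replace("model.mm_projector", "mm")
    name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
    name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
    return name

samples = [
    "model.mm_projector.mlp.mlp.0.weight",  # -> mm.model.mlp.0.weight (TN_MVLM_PROJ_MLP)
    "model.mm_projector.mlp.mlp.2.bias",    # -> mm.model.mlp.2.bias
    "model.mm_projector.peg.peg.0.weight",  # -> mm.model.peg.0.weight (TN_MVLM_PROJ_PEG)
]
for s in samples:
    print(s, "->", rename_projector_tensor(s))
```

If the converter's renamed GGUF tensor names did not line up with these format strings, `clip_model_load` would fail in `get_tensor`, which is why the converter and loader changes land in the same commit.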
