
Commit e86b7ea

cgraph ok, just missing 2D RoPE
1 parent 44cd468 commit e86b7ea

6 files changed, +304 -13 lines

convert_hf_to_gguf.py

Lines changed: 49 additions & 0 deletions
@@ -1898,6 +1898,55 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("LlavaForConditionalGeneration")
+class LlavaVisionModel(VisionModel):
+    img_break_tok_id = -1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "pixtral":
+            # fix missing config.json values
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
+            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
+            self.img_break_tok_id = 12  # see tokenizer_config.json
+        else:
+            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams["model_type"] == "pixtral":
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            # default values below are taken from HF transformers code
+            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = n_head
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            # process vision tensors
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
+            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
+            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
+            img_break_embd = data_torch[self.img_break_tok_id]
+            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
+            return [(self.map_tensor_name(name), img_break_embd)]
+
+        return []  # skip other tensors
+
+
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
 class SmolVLMModel(VisionModel):
     def __init__(self, *args, **kwargs):

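A note on the q_proj/k_proj handling above: LlamaModel.permute is an existing helper in convert_hf_to_gguf.py that reorders the rows of a Q/K projection from the interleaved rotary layout of HF checkpoints into the split layout llama.cpp's RoPE expects. A minimal sketch of that reordering (illustrative only; permute_rows is not the upstream function name):

import torch

def permute_rows(weights: torch.Tensor, n_head: int) -> torch.Tensor:
    # split each head's rows into interleaved rotary pairs, then regroup them
    # into two contiguous halves; the overall tensor shape is unchanged
    head_dim = weights.shape[0] // n_head
    return (weights.reshape(n_head, 2, head_dim // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

Since only the row grouping changes, the same reordering applies to both the .weight and .bias tensors, which is why the diff matches either suffix.
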
examples/llava/clip-impl.h

Lines changed: 4 additions & 1 deletion
@@ -60,6 +60,7 @@
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
 #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
 #define TN_LN_1 "%s.blk.%d.ln1.%s"
 #define TN_LN_2 "%s.blk.%d.ln2.%s"
@@ -73,6 +74,7 @@
 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
+#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -101,6 +103,7 @@ enum projector_type {
     PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_IDEFICS3,
+    PROJECTOR_TYPE_PIXTRAL,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -112,7 +115,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_GLM_EDGE, "adapter"},
     { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
     { PROJECTOR_TYPE_GEMMA3, "gemma3"},
-    { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
+    { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {

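For context on the clip-impl.h additions: the per-layer TN_* entries are printf-style name templates that the clip loader fills in with a prefix, block index, and weight/bias suffix, while TN_TOK_IMG_BREAK is a fixed name, presumably matching the v.token_embd.img_break tensor the converter above writes for the extracted [IMG_BREAK] embedding. A tiny illustration of the template expansion (the block index and suffix here are made up):

# same template string as the TN_FFN_GATE macro added above
TN_FFN_GATE = "%s.blk.%d.ffn_gate.%s"

# hypothetical expansion for the vision prefix "v", block 3, weight tensor
print(TN_FFN_GATE % ("v", 3, "weight"))  # -> v.blk.3.ffn_gate.weight
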