Commit 5442939

giuseppe and sroecker authored
llama : support small Granite models (#7481)
* Add optional MLP bias for Granite models

  Add optional MLP bias for ARCH_LLAMA to support Granite models.
  Partially addresses /issues/7116
  Still needs some more changes to properly support Granite.

* llama: honor add_space_prefix from the model configuration

  propagate the add_space_prefix configuration from the HF model
  configuration to the gguf file and honor it with the gpt2 tokenizer.

  Signed-off-by: Giuseppe Scrivano <[email protected]>

* llama: add support for small granite models

  it works only for the small models 3b and 8b.

  The convert-hf-to-gguf.py script uses the vocabulary size of the
  granite models to detect granite and set the correct configuration.

  Signed-off-by: Giuseppe Scrivano <[email protected]>

---------

Signed-off-by: Giuseppe Scrivano <[email protected]>
Co-authored-by: Steffen Roecker <[email protected]>
1 parent 56411a9 · commit 5442939

2 files changed: +34 lines, −8 lines

convert-hf-to-gguf.py

Lines changed: 13 additions & 2 deletions
@@ -1317,6 +1317,17 @@ def set_gguf_parameters(self):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                if "add_prefix_space" in tokenizer_config_json:
+                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+        # Apply to granite small models only
+        if self.hparams.get("vocab_size", 32000) == 49152:
+            self.gguf_writer.add_add_bos_token(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
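For reference, the check added above can be reproduced outside the converter. Below is a minimal standalone sketch; the helper name, return structure, and example path are illustrative and not part of convert-hf-to-gguf.py. It reads add_prefix_space from tokenizer_config.json (which the converter propagates to the GGUF metadata for the gpt2 tokenizer) and flags a model as a small Granite variant purely by its 49152-entry vocabulary, in which case the converter disables the automatic BOS token.

import json
from pathlib import Path

# Illustrative helper (not part of convert-hf-to-gguf.py): mirrors the checks
# the converter performs on a Hugging Face model directory.
def inspect_granite_config(model_dir: str) -> dict:
    model_path = Path(model_dir)
    result = {"add_space_prefix": None, "looks_like_granite_small": False}

    # tokenizer_config.json may carry add_prefix_space; if present, the
    # converter writes it into the GGUF so the gpt2 tokenizer can honor it.
    tokenizer_config_file = model_path / "tokenizer_config.json"
    if tokenizer_config_file.is_file():
        with open(tokenizer_config_file, "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)
        result["add_space_prefix"] = tokenizer_config.get("add_prefix_space")

    # The small Granite models are detected purely by their vocabulary size
    # (49152); for those the converter calls add_add_bos_token(False).
    config_file = model_path / "config.json"
    if config_file.is_file():
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
        result["looks_like_granite_small"] = config.get("vocab_size", 32000) == 49152

    return result

if __name__ == "__main__":
    # Placeholder path; point it at any local Hugging Face model directory.
    print(inspect_granite_config("path/to/granite-3b-code-base"))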
@@ -1331,9 +1342,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
 
-        if name.endswith("q_proj.weight"):
+        if name.endswith(("q_proj.weight", "q_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_head)
-        if name.endswith("k_proj.weight"):
+        if name.endswith(("k_proj.weight", "k_proj.bias")):
             data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
 
         # process the experts separately
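This second hunk extends the head permutation from the q/k projection weights to their bias vectors, for checkpoints that ship them. A small PyTorch sketch below shows why the same helper can serve both: the reshape only regroups the leading (output-row) dimension, which a 1-D bias shares with its 2-D weight. The permute body follows the same reshape/swapaxes pattern as the converter's LlamaModel.permute; the shapes are illustrative.

import torch

# Head-wise permutation in the style of LlamaModel.permute: regroup the two
# rotary halves of each attention head along the leading dimension.
def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))

# Illustrative shapes only: 4 heads of dimension 8, hidden size 32.
n_head, n_embd = 4, 32
q_weight = torch.randn(n_embd, n_embd)  # 2-D q_proj.weight
q_bias   = torch.randn(n_embd)          # 1-D q_proj.bias (present in some checkpoints)

# The reshape touches only the output-row dimension, so the same function
# applies to the weight matrix and its bias vector, and row i of the weight
# stays aligned with element i of the bias after the permutation.
permuted_w = permute(q_weight, n_head, n_head)
permuted_b = permute(q_bias, n_head, n_head)
assert permuted_w.shape == q_weight.shape and permuted_b.shape == q_bias.shape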

llama.cpp

Lines changed: 21 additions & 6 deletions
@@ -2028,8 +2028,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;
 
     // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b; // b3
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b = nullptr; // b3
     struct ggml_tensor * ffn_act;
 
     // mamba proj
@@ -4058,7 +4059,9 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                    // granite uses a vocab with len 49152
+                    case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
+                    case 36: model.type = e_model::MODEL_8B; break; // granite
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -4328,6 +4331,8 @@ static void llm_load_hparams(
                     case 30: model.type = e_model::MODEL_3B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_15B; break;
+                    case 52: model.type = e_model::MODEL_20B; break; // granite
+                    case 88: model.type = e_model::MODEL_34B; break; // granite
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4590,6 +4595,11 @@ static void llm_load_vocab(
     } else {
         if (tokenizer_model == "gpt2") {
             vocab.type = LLAMA_VOCAB_TYPE_BPE;
+
+            const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
+            if (add_space_prefix_keyidx != -1) {
+                vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            }
         } else {
             LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
             LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
@@ -5211,6 +5221,11 @@ static bool llm_load_tensors(
                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                        // optional MLP bias
+                        layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

@@ -7483,9 +7498,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);
 
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
