Commit 0e9d1a7

feat(llama.cpp): First pass at full port of granite deviations from llama

Something is still not working right since the results are mostly terrible, but on occasion it's producing relevant results at this point, so _something_ is working.

Branch: GraniteLM

Signed-off-by: Gabe Goodhart <[email protected]>

1 parent 65afc73 commit 0e9d1a7
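
For orientation: the diff below reuses the LLaMA build path and layers four Granite-specific scaling hyperparameters onto it: f_embedding_multiplier scales the input embeddings, f_residual_multiplier damps both the attention and FFN branches before each residual add, f_attention_multiplier replaces the usual 1/sqrt(n_embd_head) KQ scale, and f_logit_scale divides the final logits. A minimal scalar sketch of that ordering follows; the numeric values and stand-in activations are illustrative only, not taken from any model config.

// Toy, scalar-level sketch of where the Granite multipliers act.
// Only the ordering mirrors the diff; all values are made up.
#include <cmath>
#include <cstdio>

int main() {
    const float f_embedding_multiplier = 12.0f;       // illustrative
    const float f_residual_multiplier  = 0.22f;       // illustrative
    const float f_attention_multiplier = 0.0078125f;  // illustrative; replaces 1/sqrt(n_embd_head)
    const float f_logit_scale          = 8.0f;        // logits get divided by this

    float x = 0.5f;                                   // stand-in for one activation
    x *= f_embedding_multiplier;                      // llm_build_inp_embd

    // per layer: both residual branches are damped before the add
    const float attn_out = 0.1f;                      // stand-in for attention output
    x = x + f_residual_multiplier * attn_out;
    const float ffn_out  = 0.2f;                      // stand-in for FFN output
    x = x + f_residual_multiplier * ffn_out;

    // the KQ scale handed to llm_build_kv
    const float kq_scale = f_attention_multiplier != 0.0f
        ? f_attention_multiplier : 1.0f / sqrtf(128.0f);

    const float logit = x / f_logit_scale;            // lm_head scaling
    printf("kq_scale=%f logit=%f\n", kq_scale, logit);
    return 0;
}
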

src/llama.cpp

Lines changed: 42 additions & 1 deletion
@@ -1416,6 +1416,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -6832,6 +6848,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

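
The tensor table added above is consumed through the tn(...) helper visible in this hunk: the %d in each format string is filled with the block index and a suffix such as "weight" is appended, producing the tensor name that ml.create_tensor looks up in the GGUF file. A hedged sketch of that expansion; format_tensor_name below is illustrative, not the actual tn implementation.

// Illustrative only: expand "blk.%d.attn_q" for block 0 with a "weight" suffix,
// i.e. the kind of name create_tensor resolves against the GGUF file.
#include <cstddef>
#include <cstdio>

static void format_tensor_name(char * buf, std::size_t n,
                               const char * fmt, int block, const char * suffix) {
    char base[128];
    snprintf(base, sizeof(base), fmt, block);   // "blk.0.attn_q"
    snprintf(buf, n, "%s.%s", base, suffix);    // "blk.0.attn_q.weight"
}

int main() {
    char name[160];
    format_tensor_name(name, sizeof(name), "blk.%d.attn_q", 0, "weight");
    printf("%s\n", name);                       // prints: blk.0.attn_q.weight
    return 0;
}
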
@@ -8726,6 +8743,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
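
ggml_scale multiplies every element of a tensor by a scalar, so the guard above is a no-op for architectures that leave f_embedding_multiplier at zero (presumably the hparams default), and Granite models get their embeddings scaled in place in the graph. A minimal standalone sketch of the op, assuming the float-scalar ggml_scale signature used in this version of ggml.

// Minimal sketch (not from the diff): what ggml_scale does to a tensor.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_f32(x, 2.0f);                                // x = [2, 2, 2, 2]

    struct ggml_tensor * y = ggml_scale(ctx, x, 12.0f);   // y = 12 * x

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    printf("y[0] = %f\n", ggml_get_f32_1d(y, 0));         // 24.0
    ggml_free(ctx);
    return 0;
}
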
@@ -10004,6 +10026,7 @@ struct llm_build_context {
        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;

@@ -10056,7 +10079,7 @@

                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
            }

            if (il == n_layer - 1) {
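
The kq_scale introduced above keeps the standard 1/sqrt(n_embd_head) attention scaling whenever f_attention_multiplier is unset (zero) and otherwise passes Granite's multiplier straight through to llm_build_kv. A quick numeric check of that selection; the head size of 128 is just an example value.

// Sanity check of the kq_scale selection logic; inputs are illustrative.
#include <cmath>
#include <cstdio>

static float pick_kq_scale(float f_attention_multiplier, int n_embd_head) {
    return f_attention_multiplier == 0.0f
        ? 1.0f / sqrtf((float) n_embd_head)
        : f_attention_multiplier;
}

int main() {
    printf("%f\n", pick_kq_scale(0.0f,       128));  // ~0.088388: default LLaMA path
    printf("%f\n", pick_kq_scale(0.0078125f, 128));  // Granite override, passed through unchanged
    return 0;
}
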
@@ -10067,6 +10090,11 @@
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
            }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
            cb(ffn_inp, "ffn_inp", il);

@@ -10103,6 +10131,11 @@
                cb(cur, "ffn_moe_out", il);
            }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
            cur = ggml_add(ctx0, cur, ffn_inp);
            cb(cur, "ffn_out", il);

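
Between this hunk and the previous one, the same damping is applied to both per-layer branches: cur is scaled by f_residual_multiplier right before the attention residual add (ffn_inp) and again before the FFN/MoE residual add (ffn_out). In scalar form that is x_out = x_in + alpha * branch(x_in); a tiny sketch with an illustrative alpha.

// Scalar sketch of the damped residual used twice per layer; alpha stands in
// for hparams.f_residual_multiplier and all values are illustrative.
#include <cstdio>

static float residual_add(float x_in, float branch_out, float alpha) {
    return x_in + alpha * branch_out;   // ggml_scale followed by ggml_add
}

int main() {
    const float alpha = 0.22f;
    float x = residual_add(1.0f,  0.5f, alpha);   // after attention: 1.11
    x       = residual_add(x,    -0.3f, alpha);   // after FFN:       1.044
    printf("%f\n", x);
    return 0;
}
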
@@ -10122,6 +10155,12 @@

        // lm_head
        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
        cb(cur, "result_output", -1);

        ggml_build_forward_expand(gf, cur);
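
Note the direction of the final scaling: the lm_head output is multiplied by 1.0f / f_logit_scale, i.e. the logits are divided by the Granite logit scale rather than multiplied by it. A one-line check with an illustrative value.

// Illustrative only: when f_logit_scale is set, logits are divided by it.
#include <cstdio>

int main() {
    const float f_logit_scale = 8.0f;   // illustrative
    const float raw_logit     = 4.0f;   // stand-in for one lm_head output
    printf("%f\n", raw_logit * (1.0f / f_logit_scale));   // 0.5
    return 0;
}
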
@@ -15300,6 +15339,7 @@ static struct ggml_cgraph * llama_build_graph(

    switch (model.arch) {
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
            {
                result = llm.build_llama();
            } break;
@@ -18540,6 +18580,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
