Commit 2ec91ee

feat(llama.cpp): First pass at full port of granite deviations from llama
Something is still not working right since the results are mostly terrible, but on occasion it's producing relevant results at this point, so _something_ is working.

Branch: GraniteLM

Signed-off-by: Gabe Goodhart <[email protected]>
1 parent 009a2b4 commit 2ec91ee

File tree

1 file changed: +42 -1 lines changed

src/llama.cpp

Lines changed: 42 additions & 1 deletion
@@ -1415,6 +1415,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
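
The name strings registered above are printf-style templates: per-layer tensors embed the block index through "%d" (e.g. "blk.%d.attn_q"), while model-wide tensors such as "token_embd" take no index. Below is a minimal sketch of how such a template expands into the name looked up at load time; format_tensor_name is a hypothetical helper standing in for the tn(...) call that appears in the llm_load_tensors hunk further down, and the ".weight" suffix mirrors the tn(LLM_TENSOR_TOKEN_EMBD, "weight") usage there.

#include <cstdio>
#include <string>

// Hypothetical helper for illustration only: expand a per-layer tensor-name
// template such as "blk.%d.attn_q" for a given block index.
static std::string format_tensor_name(const char * tmpl, int block_idx) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), tmpl, block_idx);
    return std::string(buf) + ".weight"; // suffix as passed to tn(..., "weight")
}

int main() {
    // For block 3 this prints "blk.3.attn_q.weight"
    std::printf("%s\n", format_tensor_name("blk.%d.attn_q", 3).c_str());
    return 0;
}
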
@@ -6844,6 +6860,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8738,6 +8755,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
@@ -10016,6 +10038,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -10068,7 +10091,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }

             if (il == n_layer - 1) {
@@ -10079,6 +10102,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -10115,6 +10143,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

@@ -10134,6 +10167,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
@@ -15322,6 +15361,7 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -18608,6 +18648,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
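
Taken together, the hunks above cover the full set of Granite deviations ported in this first pass: four optional scalings layered onto the stock llama graph, each keyed off an hparams field and skipped when that field is left at 0.0f, so models taking the plain LLM_ARCH_LLAMA path are unaffected. The following is a minimal sketch, not llama.cpp code: it restates where each multiplier acts using plain floats in place of ggml tensors, with the hparams field names taken from the diff and the example values in main() invented for illustration.

// Minimal sketch (not llama.cpp code): where the four Granite multipliers act.
// The arithmetic below uses plain floats as stand-ins for ggml tensors, so the
// "attention" and residual steps are placeholders, not the real graph math.
#include <cmath>
#include <cstdio>

struct granite_hparams {
    // 0.0f means "not set", matching the checks in the diff
    float f_embedding_multiplier = 0.0f; // scales token embeddings (llm_build_inp_embd)
    float f_attention_multiplier = 0.0f; // replaces 1/sqrt(n_embd_head) as the KQ scale
    float f_residual_multiplier  = 0.0f; // scales the branch output before each residual add
    float f_logit_scale          = 0.0f; // final logits are divided by this after lm_head
};

float granite_scaled_forward(const granite_hparams & hp, float tok_embd, int n_embd_head) {
    // 1. embedding scaling
    float inpL = tok_embd;
    if (hp.f_embedding_multiplier != 0.0f) {
        inpL *= hp.f_embedding_multiplier;
    }

    // 2. attention-score scaling: this factor multiplies the K.Q scores inside
    //    the attention block (llm_build_kv in the real graph)
    const float kq_scale = hp.f_attention_multiplier == 0.0f
        ? 1.0f/std::sqrt(float(n_embd_head))
        : hp.f_attention_multiplier;
    float attn_out = inpL * kq_scale; // placeholder for the attention block

    // 3. residual scaling, applied to the branch output before the residual add
    //    (the same multiplier is used after the attention block and after the FFN)
    if (hp.f_residual_multiplier != 0.0f) {
        attn_out *= hp.f_residual_multiplier;
    }
    float cur = inpL + attn_out; // residual add

    // 4. logit scaling after lm_head
    if (hp.f_logit_scale != 0.0f) {
        cur /= hp.f_logit_scale;
    }
    return cur;
}

int main() {
    granite_hparams hp;
    hp.f_embedding_multiplier = 12.0f; // example values only, not from the commit
    hp.f_residual_multiplier  = 0.22f;
    hp.f_logit_scale          = 8.0f;
    std::printf("%f\n", granite_scaled_forward(hp, 1.0f, 128));
    return 0;
}
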
