Commit ec13f29
feat(llama.cpp): First pass at full port of granite deviations from llama

Something is still not working right since the results are mostly terrible, but on occasion it's producing relevant results at this point, so _something_ is working.

Branch: GraniteLM

Signed-off-by: Gabe Goodhart <[email protected]>

1 parent 383065a · commit ec13f29

File tree: 1 file changed (+42, -1)

src/llama.cpp: 42 additions & 1 deletion
@@ -1462,6 +1462,22 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_GRANITE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -6915,6 +6931,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
+        case LLM_ARCH_GRANITE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -8898,6 +8915,11 @@ static struct ggml_tensor * llm_build_inp_embd(
         ggml_set_input(lctx.inp_embd);
     }

+    // For Granite architecture
+    if (hparams.f_embedding_multiplier != 0.0f) {
+        inpL = ggml_scale(ctx, inpL, hparams.f_embedding_multiplier);
+    }
+
     cb(inpL, "inp_embd", -1);

     return inpL;
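The embedding scaling above runs once per graph, before any transformer layer. Read as math (an interpretation of the diff, assuming f_embedding_multiplier carries Granite's embedding multiplier and zero means "unset"):

    h_0 = \alpha_{\mathrm{emb}} \cdot E[x], \qquad \alpha_{\mathrm{emb}} = \texttt{f\_embedding\_multiplier} \neq 0

When the hparam is left at zero the branch is skipped and the graph matches stock llama.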
@@ -10176,6 +10198,7 @@ struct llm_build_context {
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

+        const float kq_scale = hparams.f_attention_multiplier == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_multiplier;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -10228,7 +10251,7 @@ struct llm_build_context {

                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
             }

             if (il == n_layer - 1) {
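The kq_scale hoisted out of the layer loop in the previous hunk replaces the hard-coded 1.0f/sqrtf(float(n_embd_head)) passed to llm_build_kv here. As a hedged reading of the intent, the attention scores become

    \mathrm{Attn}(Q,K,V) = \mathrm{softmax}\!\left(s\,QK^{\top} + M\right)V, \qquad
    s = \begin{cases}\texttt{f\_attention\_multiplier} & \text{if non-zero}\\ 1/\sqrt{d_{\mathrm{head}}} & \text{otherwise,}\end{cases}

so models that leave the hparam at zero keep the usual 1/sqrt(d_head) scaling.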
@@ -10239,6 +10262,11 @@ struct llm_build_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

@@ -10275,6 +10303,11 @@ struct llm_build_context {
                 cb(cur, "ffn_moe_out", il);
             }

+            // For Granite architecture
+            if (hparams.f_residual_multiplier) {
+                cur = ggml_scale(ctx0, cur, hparams.f_residual_multiplier);
+            }
+
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
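The same guard appears twice per layer: once on the attention output (the hunk at 10262 above) and once on the FFN/MoE output here, so each residual branch is scaled before it is added back. Assuming f_residual_multiplier is Granite's residual multiplier (and effectively 1 when unset), a layer computes

    x_{\mathrm{mid}} = x_{l} + \alpha_{\mathrm{res}}\,\mathrm{Attn}(\mathrm{norm}(x_{l})), \qquad
    x_{l+1} = x_{\mathrm{mid}} + \alpha_{\mathrm{res}}\,\mathrm{FFN}(\mathrm{norm}(x_{\mathrm{mid}})).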

@@ -10294,6 +10327,12 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+        // For Granite architecture
+        if (hparams.f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        }
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
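Unlike the other multipliers, f_logit_scale divides: the lm_head output is scaled by 1.0f / f_logit_scale rather than multiplied. In formula form (again an interpretation of the diff, not something it states):

    \mathrm{logits} = \frac{W_{\mathrm{out}}\,h_{L}}{\texttt{f\_logit\_scale}}, \qquad \text{applied only when } \texttt{f\_logit\_scale} \neq 0.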
@@ -15819,6 +15858,7 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_GRANITE:
             {
                 result = llm.build_llama();
             } break;
@@ -19115,6 +19155,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
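Taken together, the four hparams are the only behavioural deviation from the stock llama graph, and every use is guarded by a zero-means-unset check, so a model that leaves them all at zero should behave exactly as before. A toy, self-contained sketch of where the multipliers act, with scalar stand-ins for tensors and made-up values (illustrative code only, not llama.cpp):

// Toy sketch: where the four Granite multipliers from this commit act.
// Scalars stand in for tensors; zero means "hparam not set".
#include <cmath>
#include <cstdio>

struct toy_hparams {
    float f_embedding_multiplier = 0.0f; // scales token embeddings once
    float f_attention_multiplier = 0.0f; // replaces 1/sqrt(d_head) as kq_scale
    float f_residual_multiplier  = 0.0f; // scales each residual branch
    float f_logit_scale          = 0.0f; // divides the final logits
};

int main() {
    toy_hparams hp;                      // hypothetical Granite-style values
    hp.f_embedding_multiplier = 12.0f;
    hp.f_attention_multiplier = 0.0078125f;
    hp.f_residual_multiplier  = 0.22f;
    hp.f_logit_scale          = 16.0f;

    const int n_embd_head = 128;

    // same selection as the kq_scale hunk above
    const float kq_scale = hp.f_attention_multiplier == 0.0f
            ? 1.0f/sqrtf(float(n_embd_head))
            : hp.f_attention_multiplier;

    float h = 1.0f;                                             // embedding lookup stand-in
    if (hp.f_embedding_multiplier != 0.0f) { h *= hp.f_embedding_multiplier; }

    float attn_out = 0.5f*h;                                    // attention branch stand-in
    if (hp.f_residual_multiplier) { attn_out *= hp.f_residual_multiplier; }
    h += attn_out;                                              // first residual add

    float ffn_out = 0.25f*h;                                    // FFN branch stand-in
    if (hp.f_residual_multiplier) { ffn_out *= hp.f_residual_multiplier; }
    h += ffn_out;                                               // second residual add

    float logit = 2.0f*h;                                       // lm_head stand-in
    if (hp.f_logit_scale) { logit /= hp.f_logit_scale; }

    printf("kq_scale = %g, logit = %g\n", kq_scale, logit);
    return 0;
}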
