@@ -594,6 +594,9 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
         },
     },
     {
@@ -4867,6 +4870,7 @@ static bool llm_load_tensors(
             case LLM_ARCH_MPT:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);

                     // output
                     {
@@ -4905,6 +4909,12 @@ static bool llm_load_tensors(
                         layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);

+                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
+                        layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+
+                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
+                        layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+
                         // AWQ ScaleActivation layer
                         layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
                     }
@@ -7721,6 +7731,7 @@ struct llm_build_context {
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

         struct ggml_tensor * cur;
+        struct ggml_tensor * pos;
         struct ggml_tensor * inpL;

         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -7731,6 +7742,16 @@ struct llm_build_context {
         // positions of the tokens in the KV cache
         struct ggml_tensor * KQ_pos = build_inp_KQ_pos();

+        if (model.pos_embd) {
+            // inp_pos - contains the positions
+            struct ggml_tensor * inp_pos = build_inp_pos();
+            pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+            cb(pos, "pos_embd", -1);
+
+            inpL = ggml_add(ctx0, inpL, pos);
+            cb(inpL, "inpL", -1);
+        }
+
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;

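The new `if (model.pos_embd)` block implements learned absolute position embeddings: `ggml_get_rows` gathers one row of the `{n_embd, n_ctx_train}` position table per token position, and the result is added to the token embeddings before the first layer. Below is a minimal standalone sketch of that lookup against the ggml API, built with the CPU convenience path (`ggml_graph_compute_with_ctx`); the toy sizes, fill values, and variable names are illustrative only and are not taken from the patch.

```cpp
#include "ggml.h"
#include <cstdio>

int main() {
    const int n_embd = 4, n_tokens = 3, n_ctx_train = 8;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // token embeddings for the current batch and the learned position table
    struct ggml_tensor * inpL     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
    struct ggml_tensor * pos_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx_train);
    struct ggml_tensor * inp_pos  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);

    ggml_set_f32(inpL,     1.0f);   // dummy token embeddings
    ggml_set_f32(pos_embd, 0.5f);   // dummy position table
    for (int i = 0; i < n_tokens; ++i) {
        ggml_set_i32_1d(inp_pos, i, i);   // positions 0..n_tokens-1
    }

    // pos  = pos_embd[inp_pos]
    // inpL = inpL + pos        -- the same two ops the patched build_mpt() graph adds
    struct ggml_tensor * pos = ggml_get_rows(ctx, pos_embd, inp_pos);
    struct ggml_tensor * cur = ggml_add(ctx, inpL, pos);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cur);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    printf("cur[0] = %.2f\n", ggml_get_f32_1d(cur, 0));   // expect 1.50
    ggml_free(ctx);
    return 0;
}
```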
@@ -7765,11 +7786,32 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                // Q/K Layernorm
+                if (model.layers[il].attn_q_norm) {
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                            model.layers[il].attn_q_norm,
+                            model.layers[il].attn_q_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur", il);

-                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            model.layers[il].attn_k_norm,
+                            model.layers[il].attn_k_norm_b,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                } else {
+                    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                    cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                }
             }

             if (il == n_layer - 1) {
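The Q/K branch wraps `Qcur`/`Kcur` in `llm_build_norm(..., LLM_NORM, ...)`, i.e. a standard (non-RMS) LayerNorm with a learned per-channel scale and shift, and it passes `nullptr` instead of `KQ_pos` to `llm_build_kv`, so checkpoints that carry these norm tensors use the learned position embeddings rather than the ALiBi positions. A rough sketch of what that norm lowers to in ggml terms is below; the helper name `qk_layer_norm` and its parameter list are illustrative, not part of the patch.

```cpp
#include "ggml.h"

// Illustrative helper: per-token LayerNorm with learned scale (gamma) and
// shift (beta), which is roughly what llm_build_norm() with LLM_NORM builds
// for attn_q_norm / attn_k_norm. `cur` is Qcur or Kcur with shape
// {n_embd, n_tokens}; `w` and `b` have shape {n_embd} and broadcast over tokens.
static struct ggml_tensor * qk_layer_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,
        struct ggml_tensor  * w,
        struct ggml_tensor  * b,
        float                 eps) {
    cur = ggml_norm(ctx, cur, eps);          // normalize each row: zero mean, unit variance
    if (w) { cur = ggml_mul(ctx, cur, w); }  // elementwise scale
    if (b) { cur = ggml_add(ctx, cur, b); }  // elementwise shift
    return cur;
}
```

Note that the `ggml_reshape_3d` of `Qcur` now happens only after the optional normalization in both branches, and the normalized `Kcur` additionally gets its own 3D reshape, which is why those lines moved inside the new `if`/`else`.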