@@ -1248,6 +1248,9 @@ struct llama_layer {
1248
1248
struct ggml_tensor * wqkv;
1249
1249
1250
1250
// attention bias (bq, bk and bv are added in the graph only when non-NULL)
1251
+ struct ggml_tensor * bq;
1252
+ struct ggml_tensor * bk;
1253
+ struct ggml_tensor * bv;
1251
1254
struct ggml_tensor * bo;
1252
1255
struct ggml_tensor * bqkv;
1253
1256
@@ -2781,6 +2784,11 @@ static void llm_load_tensors(
2781
2784
layer.wk = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_K, " weight" , i), {n_embd, n_embd_gqa}, backend_split);
2782
2785
layer.wv = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_V, " weight" , i), {n_embd, n_embd_gqa}, backend_split);
2783
2786
layer.wo = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_OUT, " weight" , i), {n_embd, n_embd}, backend_split);
2787
+
2788
+ layer.bq = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_Q, " bias" , i), {n_embd}, backend);
2789
+ layer.bk = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_K, " bias" , i), {n_embd_gqa}, backend);
2790
+ layer.bv = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_V, " bias" , i), {n_embd_gqa}, backend);
2791
+ layer.bo = ml.create_tensor (ctx, tn (LLM_TENSOR_ATTN_OUT, " bias" , i), {n_embd}, backend);
2784
2792
2785
2793
layer.ffn_norm = ml.create_tensor (ctx, tn (LLM_TENSOR_FFN_NORM, " weight" , i), {n_embd}, backend);
2786
2794
@@ -2791,8 +2799,9 @@ static void llm_load_tensors(
2791
2799
if (backend == GGML_BACKEND_GPU) {
2792
2800
vram_weights +=
2793
2801
ggml_nbytes (layer.attn_norm ) + ggml_nbytes (layer.wq ) + ggml_nbytes (layer.wk ) +
2794
- ggml_nbytes (layer.wv ) + ggml_nbytes (layer.wo ) + ggml_nbytes (layer.ffn_norm ) +
2795
- ggml_nbytes (layer.ffn_gate ) + ggml_nbytes (layer.ffn_down ) + ggml_nbytes (layer.ffn_up );
2802
+ ggml_nbytes (layer.wv ) + ggml_nbytes (layer.wo ) + ggml_nbytes (layer.bq ) +
2803
+ ggml_nbytes (layer.bk ) + ggml_nbytes (layer.bv ) + ggml_nbytes (layer.bo ) +
2804
+ ggml_nbytes (layer.ffn_norm ) + ggml_nbytes (layer.ffn_gate ) + ggml_nbytes (layer.ffn_down ) + ggml_nbytes (layer.ffn_up );
2796
2805
}
2797
2806
}
2798
2807
} break ;
@@ -3891,13 +3900,25 @@ struct llm_build_context {
3891
3900
// compute Q, K and V (adding each bias when it is present), then RoPE Q and K
3892
3901
struct ggml_tensor * Qcur = ggml_mul_mat (ctx0, model.layers [il].wq , cur);
3893
3902
cb (Qcur, " Qcur" , il);
3903
+ if (model.layers [il].bq ) {
3904
+ Qcur = ggml_add (ctx0, Qcur, model.layers [il].bq );
3905
+ cb (Qcur, " Qcur" , il);
3906
+ }
3894
3907
3895
3908
struct ggml_tensor * Kcur = ggml_mul_mat (ctx0, model.layers [il].wk , cur);
3896
3909
cb (Kcur, " Kcur" , il);
3910
+ if (model.layers [il].bk ) {
3911
+ Kcur = ggml_add (ctx0, Kcur, model.layers [il].bk );
3912
+ cb (Kcur, " Kcur" , il);
3913
+ }
3897
3914
3898
3915
struct ggml_tensor * Vcur = ggml_mul_mat (ctx0, model.layers [il].wv , cur);
3899
3916
cb (Vcur, " Vcur" , il);
3900
-
3917
+ if (model.layers [il].bv ) {
3918
+ Vcur = ggml_add (ctx0, Vcur, model.layers [il].bv );
3919
+ cb (Vcur, " Vcur" , il);
3920
+ }
3921
+
3901
3922
Qcur = ggml_rope_custom (
3902
3923
ctx0, ggml_reshape_3d (ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
3903
3924
n_embd_head, 0 , 0 , n_orig_ctx, freq_base, freq_scale,
@@ -3915,7 +3936,7 @@ struct llm_build_context {
3915
3936
llm_build_kv_store (ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
3916
3937
3917
3938
cur = llm_build_kqv (ctx0, hparams, kv_self,
3918
- model.layers [il].wo , NULL ,
3939
+ model.layers [il].wo , model. layers [il]. bo ,
3919
3940
Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1 .0f , cb, il);
3920
3941
cb (cur, " kqv_out" , il);
3921
3942
}
0 commit comments