@@ -4054,6 +4054,8 @@ static bool llm_load_tensors(
             // output
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+
                 model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
             }

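The trailing `false` passed to `ml.create_tensor` is the `required` flag: the original MPT checkpoints ship without these bias tensors, so loading must not fail when a bias is absent. A minimal sketch of that contract (my own illustration, not llama.cpp's actual loader):

```cpp
#include <map>
#include <stdexcept>
#include <string>

struct tensor;  // stand-in for ggml_tensor

struct loader_sketch {
    std::map<std::string, tensor *> by_name;  // tensors found in the model file

    // A missing optional tensor yields nullptr instead of an error,
    // so bias-free checkpoints still load.
    tensor * create_tensor(const std::string & name, bool required = true) {
        auto it = by_name.find(name);
        if (it != by_name.end()) {
            return it->second;
        }
        if (required) {
            throw std::runtime_error("missing required tensor: " + name);
        }
        return nullptr;  // optional tensor absent: callers null-check later
    }
};
```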
@@ -4063,14 +4065,23 @@ static bool llm_load_tensors(

                 auto & layer = model.layers[i];

-                layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, false);

                 layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
+
                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd}, false);

-                layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-                layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, false);
+
+                layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd}, false);
+
+                layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff}, false);

                 // AWQ ScaleActivation layer
                 layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
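Worth noting in this hunk: the 2-D matmul weights (`wqkv`, `wo`, `ffn_down`, `ffn_up`) live in `ctx_split`, while every new 1-D bias lands in `ctx_layer` next to the norms. My reading, an assumption rather than something the diff states, is that split contexts hold tensors that may be sharded row-wise across backends, and small vectors are cheaper to keep replicated per layer:

```cpp
struct ggml_context;  // provided by ggml

// Hypothetical helper making the placement rule explicit; llama.cpp
// spells this out per tensor rather than via such a function.
static ggml_context * pick_ctx(int n_dims, ggml_context * ctx_layer, ggml_context * ctx_split) {
    // matrices can be split across devices; norms and biases stay per-layer
    return n_dims == 2 ? ctx_split : ctx_layer;
}
```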
@@ -6171,7 +6182,7 @@ struct llm_build_context {

                 attn_norm = llm_build_norm(ctx0, inpL, hparams,
                         model.layers[il].attn_norm,
-                        NULL,
+                        model.layers[il].attn_norm_b,
                         LLM_NORM, cb, il);
                 cb(attn_norm, "attn_norm", il);

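With a non-NULL bias, `llm_build_norm` in `LLM_NORM` mode produces a full LayerNorm, `y = (x - mean) / sqrt(var + eps) * w + b`, instead of the scale-only variant the NULL used to select. Roughly, in ggml terms (a condensed sketch, not the function's actual body):

```cpp
#include "ggml.h"

// Sketch: LLM_NORM with optional scale/shift, expressed directly in ggml.
static ggml_tensor * layer_norm_sketch(
        ggml_context * ctx, ggml_tensor * x,
        ggml_tensor * w, ggml_tensor * b, float eps) {
    ggml_tensor * cur = ggml_norm(ctx, x, eps);  // (x - mean) / sqrt(var + eps)
    if (w) { cur = ggml_mul(ctx, cur, w); }      // elementwise scale
    if (b) { cur = ggml_add(ctx, cur, b); }      // shift only when the bias exists
    return cur;
}
```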
@@ -6181,6 +6192,11 @@ struct llm_build_context {

                 cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);
+
+                if (model.layers[il].bqkv) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }

                 if (hparams.f_clamp_kqv > 0.0f) {
                     cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
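Because `wqkv` is the fused QKV projection, its bias is a single vector of length `n_embd + 2*n_embd_gqa`: the first `n_embd` entries belong to Q and the two `n_embd_gqa` tails to K and V. One null-guarded `ggml_add` (broadcast over `n_tokens`) therefore biases all three heads' inputs before the tensor is sliced, roughly like this (the views are illustrative of what the surrounding builder does; `ggml_view_2d` takes byte offsets, hence `ggml_element_size`):

```cpp
#include "ggml.h"

// Sketch of the fused-bias step followed by the Q/K/V slicing;
// exact strides and views in the real builder may differ.
static void qkv_bias_sketch(
        ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * bqkv,
        int64_t n_embd, int64_t n_embd_gqa, int64_t n_tokens) {
    if (bqkv) {
        cur = ggml_add(ctx0, cur, bqkv);  // one add covers the Q, K and V segments
    }
    const size_t es = ggml_element_size(cur);
    ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0);
    ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], es *  n_embd);
    ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], es * (n_embd + n_embd_gqa));
    (void) Qcur; (void) Kcur; (void) Vcur;  // consumed by attention in the real graph
}
```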
@@ -6198,7 +6214,7 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);

                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                 cb(cur, "kqv_out", il);
             }
@@ -6211,13 +6227,13 @@ struct llm_build_context {
             {
                 cur = llm_build_norm(ctx0, ffn_inp, hparams,
                         model.layers[il].ffn_norm,
-                        NULL,
+                        model.layers[il].ffn_norm_b,
                         LLM_NORM, cb, il);
                 cb(cur, "ffn_norm", il);
                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                         NULL, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         model.layers[il].ffn_act,
                         LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
                 cb(cur, "ffn_out", il);
@@ -6234,7 +6250,7 @@ struct llm_build_context {

             cur = llm_build_norm(ctx0, cur, hparams,
                     model.output_norm,
-                    NULL,
+                    model.output_norm_b,
                     LLM_NORM, cb, -1);
             cb(cur, "result_norm", -1);
