@@ -1953,8 +1953,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;

     // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act;

     // mamba proj
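The new `ffn_gate_b` member and the `= nullptr` defaults make the MLP bias tensors optional: a layer loaded from a checkpoint without bias simply keeps the null defaults. A minimal sketch of the pattern this enables (illustrative, not part of the patch):

```cpp
// Illustrative sketch: because the bias members default to nullptr,
// graph-building code can branch on presence without extra bookkeeping.
if (layer.ffn_up_b) {
    cur = ggml_add(ctx0, cur, layer.ffn_up_b); // bias present: add it
}
// bias absent: cur is left untouched
```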
@@ -5103,6 +5104,11 @@ static bool llm_load_tensors(
                 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
                 layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+
+                // optional MLP bias
+                layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.ffn_up_b   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   llama_model_loader::TENSOR_NOT_REQUIRED);
             } else {
                 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

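The `TENSOR_NOT_REQUIRED` flag is what keeps this backward compatible: with it set, `create_tensor` returns `nullptr` when the named tensor is absent from the GGUF file instead of treating it as a load error. A hedged sketch of that lookup behavior, paraphrased rather than copied from the loader (the variable names `meta`, `flags`, and `name` are illustrative):

```cpp
// Paraphrased sketch of the optional-tensor lookup, not the verbatim loader:
struct ggml_tensor * cur = ggml_get_tensor(meta, name.c_str());
if (cur == NULL) {
    if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
        return NULL; // optional tensor absent: caller receives nullptr
    }
    throw std::runtime_error(format("missing tensor: %s", name.c_str()));
}
```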
@@ -7305,9 +7311,9 @@ struct llm_build_context {
             cb(cur, "ffn_norm", il);

             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                     NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);