@@ -1922,6 +1922,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_shexp;

     // ff bias
+    struct ggml_tensor * ffn_gate_b;
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b;   // b3
     struct ggml_tensor * ffn_act;
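The new ffn_gate_b field sits alongside the existing ffn_down_b/ffn_up_b biases; like them it is just a tensor pointer that stays NULL when the model carries no gate bias. For context, llm_build_ffn applies a bias only when its pointer is non-NULL, so this declaration changes nothing for existing models. A minimal sketch of that guard pattern (abbreviated names, not the verbatim llm_build_ffn body):

    // optional-bias pattern assumed here: add the bias tensor only when the
    // loader actually found one, so bias-free models build the same graph
    struct ggml_tensor * tmp = ggml_mul_mat(ctx0, gate, cur); // x * W_gate
    if (gate_b) {
        tmp = ggml_add(ctx0, tmp, gate_b);                    // + b_gate
    }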
@@ -5006,6 +5007,11 @@ static bool llm_load_tensors(
             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
             layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+
+            // optional MLP bias
+            layer.ffn_gate_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff},   false);
+            layer.ffn_down_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+            layer.ffn_up_b   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "bias", i), {n_ff},   false);
         } else {
             layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
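The trailing false marks these tensors as optional: create_tensor is expected to return nullptr rather than fail when the named tensor is absent from the GGUF file, which keeps bias-free checkpoints loading exactly as before. For layer i, tn(...) resolves to per-layer GGUF tensor names; a small standalone illustration of the names these lookups are assumed to resolve to (hypothetical blk.<i>.* naming, shown for two layers):

    #include <cstdio>

    // hypothetical illustration, not llama.cpp code: print the GGUF tensor
    // names the optional-bias lookups above are assumed to target
    int main() {
        for (int i = 0; i < 2; ++i) {
            std::printf("blk.%d.ffn_gate.bias\n", i); // shape {n_ff}
            std::printf("blk.%d.ffn_down.bias\n", i); // shape {n_embd}
            std::printf("blk.%d.ffn_up.bias\n",   i); // shape {n_ff}
        }
        return 0;
    }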
@@ -7133,9 +7139,9 @@ struct llm_build_context {
                 cb(cur, "ffn_norm", il);

                 cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up,   NULL,
-                        model.layers[il].ffn_gate, NULL,
-                        model.layers[il].ffn_down, NULL,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                 cb(cur, "ffn_out", il);
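With the biases wired in, LLM_FFN_SILU + LLM_FFN_PAR builds the usual gated (SwiGLU-style) MLP, now with an additive bias on each projection when present: ffn_out = W_down * (silu(W_gate*x + b_gate) * (W_up*x + b_up)) + b_down. A hedged ggml sketch of that graph, assuming all three biases were found (the real code guards each add on the pointer being non-NULL):

    // gated FFN with biases; gate/up/down and *_b stand in for the
    // model.layers[il].ffn_* tensors passed to llm_build_ffn above
    struct ggml_tensor * g = ggml_add(ctx0, ggml_mul_mat(ctx0, gate, cur), gate_b);
    struct ggml_tensor * u = ggml_add(ctx0, ggml_mul_mat(ctx0, up,   cur), up_b);
    struct ggml_tensor * h = ggml_mul(ctx0, ggml_silu(ctx0, g), u);  // silu(g) * u
    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, down, h), down_b);       // + b_down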