@@ -1922,16 +1922,6 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
-
-    //glu mlp (jina-bert)
-    struct ggml_tensor * mlp_gated_layer_w;
-
-    struct ggml_tensor * mlp_wo_w;
-    struct ggml_tensor * mlp_wo_b;
-
-    struct ggml_tensor * mlp_norm_w;
-    struct ggml_tensor * mlp_norm_b;
-
 };

 struct llama_kv_cell {
@@ -4904,13 +4894,13 @@ static bool llm_load_tensors(
                         layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

                         // TODO: HANDLE ALL THE MLP
-                        layer.mlp_gated_layer_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE , "weight", i), {n_embd, 2 * n_ff});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP , "weight", i), {n_embd, 2 * n_ff});

-                        layer.mlp_wo_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                        layer.mlp_wo_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

-                        layer.mlp_norm_w = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
-                        layer.mlp_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+                        layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+                        layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
                     }
                 } break;
             case LLM_ARCH_BLOOM:
@@ -7564,7 +7554,7 @@ struct llm_build_context {
             struct ggml_tensor * Vcur;

             // self-attention
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                 Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
                 cb(Qcur, "Qcur", il);

@@ -7654,7 +7644,7 @@ struct llm_build_context {
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
-            if (model.arch == LLM_ARCH_BERT) {
+            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT) {
                 cur = llm_build_ffn(ctx0, cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b,
                         NULL, NULL,
@@ -7677,6 +7667,7 @@ struct llm_build_context {
             // output layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);

+
             // input for next layer
             inpL = cur;
         }
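
For reference, a minimal standalone C++ sketch of the gated (GLU-style) feed-forward block that the tensor shapes above imply: the single ffn_up weight of shape {n_embd, 2 * n_ff} packs the up and gate projections side by side, and ffn_down / ffn_down_b project back to n_embd. Plain loops stand in for ggml graph ops; the row-major layout, the split order of the two halves, and applying GELU to the gate half are assumptions for illustration, not the llama.cpp code path, and geglu_ffn is a hypothetical helper name.

#include <cmath>
#include <cstdio>
#include <vector>

// Sketch only: x is [n_embd]; w_up is n_embd x (2*n_ff), row-major, with the
// first n_ff columns as the "up" projection and the last n_ff as the "gate"
// (assumed order); w_down is n_ff x n_embd; b_down is [n_embd].
static std::vector<float> geglu_ffn(
        const std::vector<float> & x,
        const std::vector<float> & w_up,
        const std::vector<float> & w_down,
        const std::vector<float> & b_down,
        int n_embd, int n_ff) {
    std::vector<float> up(n_ff), gate(n_ff);
    for (int j = 0; j < n_ff; ++j) {
        float u = 0.0f, g = 0.0f;
        for (int i = 0; i < n_embd; ++i) {
            u += x[i] * w_up[i*2*n_ff + j];         // "up" half of the packed matrix
            g += x[i] * w_up[i*2*n_ff + n_ff + j];  // "gate" half of the packed matrix
        }
        up[j]   = u;
        gate[j] = g;
    }
    // gated activation: gelu(gate) * up, then the down projection plus its bias
    std::vector<float> out(b_down);
    for (int j = 0; j < n_ff; ++j) {
        const float t = gate[j];
        const float gelu = 0.5f*t*(1.0f + std::tanh(0.79788456f*(t + 0.044715f*t*t*t)));
        const float h = gelu * up[j];
        for (int i = 0; i < n_embd; ++i) {
            out[i] += h * w_down[j*n_embd + i];
        }
    }
    return out;
}

int main() {
    const int n_embd = 4, n_ff = 8;
    const std::vector<float> x(n_embd, 0.1f);
    const std::vector<float> w_up(n_embd*2*n_ff, 0.01f);
    const std::vector<float> w_down(n_ff*n_embd, 0.01f);
    const std::vector<float> b_down(n_embd, 0.0f);
    const std::vector<float> y = geglu_ffn(x, w_up, w_down, b_down, n_embd, n_ff);
    printf("y[0] = %f\n", y[0]);
    return 0;
}

The layer_out_norm / layer_out_norm_b tensors are applied after this block by llm_build_norm, as the last hunk shows.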