@@ -205,6 +205,7 @@ enum llm_arch {
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
+    LLM_ARCH_MINICPM,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -228,6 +229,7 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_CODESHELL,           "codeshell" },
     { LLM_ARCH_ORION,               "orion"     },
     { LLM_ARCH_INTERNLM2,           "internlm2" },
+    { LLM_ARCH_MINICPM,             "minicpm"   },
 };
 
 enum llm_kv {
@@ -690,6 +692,29 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_MINICPM,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXP,    "blk.%d.ffn_gate.%d" },
+            { LLM_TENSOR_FFN_DOWN_EXP,    "blk.%d.ffn_down.%d" },
+            { LLM_TENSOR_FFN_UP_EXP,      "blk.%d.ffn_up.%d" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
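
The %d placeholders in the MINICPM name templates above are filled in per layer, and the *_EXP entries take a second index per expert, before a ".weight" suffix is appended at lookup time (cf. tn(..., "weight") in the loader hunk further down). A minimal, purely illustrative expansion of these templates follows; format_tensor_name is a hypothetical helper invented for this sketch, not the tn(...) helper that llama.cpp actually uses.

// Illustrative sketch only: expands a per-layer / per-expert name template such as
// "blk.%d.ffn_gate.%d". Not the real llama.cpp lookup path.
#include <cstdio>
#include <string>

static std::string format_tensor_name(const std::string & templ, int layer, int expert = -1) {
    char buf[256];
    if (expert >= 0) {
        std::snprintf(buf, sizeof(buf), templ.c_str(), layer, expert); // two %d placeholders
    } else {
        std::snprintf(buf, sizeof(buf), templ.c_str(), layer);         // one %d placeholder
    }
    return std::string(buf) + ".weight"; // suffix added at lookup time
}

int main() {
    std::printf("%s\n", format_tensor_name("blk.%d.attn_q", 3).c_str());         // blk.3.attn_q.weight
    std::printf("%s\n", format_tensor_name("blk.%d.ffn_gate.%d", 3, 1).c_str());  // blk.3.ffn_gate.1.weight
    return 0;
}
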
@@ -1390,6 +1415,7 @@ enum e_model {
     MODEL_UNKNOWN,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_2B,
     MODEL_3B,
     MODEL_4B,
     MODEL_7B,
@@ -2748,6 +2774,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_1B:  return "1B";
+        case MODEL_2B:  return "2B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_8B:  return "8B";
@@ -2887,6 +2914,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MINICPM:
+            {
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_2B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_FALCON:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3524,13 +3558,16 @@ static bool llm_load_tensors(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_REFACT:
+        case LLM_ARCH_MINICPM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    if (model.arch != LLM_ARCH_MINICPM){
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    }
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
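
The MINICPM branch above deliberately skips creating the output (lm_head) tensor: MiniCPM ties the output projection to the token embedding matrix, and build_minicpm() further down reuses model.tok_embd for the final matmul. A minimal sketch of what tied embeddings compute, in plain C++ for illustration (the diff itself does this via ggml_mul_mat(ctx0, model.tok_embd, cur)):

// Minimal sketch of tied input/output embeddings (illustrative, not the ggml API).
// The same [n_vocab x n_embd] matrix E both embeds tokens and produces logits as E * h.
#include <cstddef>
#include <vector>

std::vector<float> tied_lm_head(const std::vector<float> & E, // n_vocab * n_embd, row-major
                                const std::vector<float> & h, // hidden state, n_embd
                                std::size_t n_vocab, std::size_t n_embd) {
    std::vector<float> logits(n_vocab, 0.0f);
    for (std::size_t v = 0; v < n_vocab; ++v) {
        for (std::size_t j = 0; j < n_embd; ++j) {
            logits[v] += E[v*n_embd + j] * h[j]; // logits[v] = dot(E[v], h)
        }
    }
    return logits;
}
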
@@ -6781,6 +6818,153 @@ struct llm_build_context {
         return gf;
     }
 
+    // ref: https://arxiv.org/abs/2203.03466
+    //      https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
+    //      based on the original build_llama() function
+    struct ggml_cgraph * build_minicpm() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        const int64_t n_embd = hparams.n_embd;
+        // TODO: if the model varies, these parameters need to be read from the model
+        const int64_t n_embd_base = 256;
+        const float scale_embd  = 12.0f;
+        const float scale_depth = 1.4f;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // scale the input embeddings
+        inpL = ggml_scale(ctx0, inpL, scale_embd);
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            // scale_res - scale the hidden states for residual connection
+            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled", -1);
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // scale the hidden states for residual connection
+            cur = ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled_ffn", -1);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head scaling
+        const float scale_lmhead = float(n_embd_base)/float(n_embd);
+        cur = ggml_scale(ctx0, cur, scale_lmhead);
+        cb(cur, "lmhead_scaling", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph(
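
build_minicpm() above follows build_llama() but adds three fixed scalings (see the arXiv reference in the code comment): the input embeddings are multiplied by scale_embd, each residual branch by scale_depth/sqrtf(n_layer), and the hidden state by n_embd_base/n_embd right before the tied lm_head. A small standalone sketch of the resulting constants for the 2B configuration follows; n_layer = 40 comes from the llm_load_hparams hunk above, while n_embd = 2304 is an assumed value for the 2B model and is not read from this diff.

// Standalone sketch of the MiniCPM scale factors used in build_minicpm().
// n_embd = 2304 is an assumption for the 2B model; the other constants come from the diff.
#include <cmath>
#include <cstdio>

int main() {
    const int   n_layer     = 40;     // from llm_load_hparams: 40 layers -> MODEL_2B
    const int   n_embd      = 2304;   // assumed hidden size for the 2B model
    const int   n_embd_base = 256;
    const float scale_embd  = 12.0f;
    const float scale_depth = 1.4f;

    const float scale_res    = scale_depth / std::sqrt((float) n_layer); // per-residual-branch scale
    const float scale_lmhead = (float) n_embd_base / (float) n_embd;     // pre-lm_head scale

    std::printf("embedding scale : %.3f\n", scale_embd);    // 12.000
    std::printf("residual scale  : %.3f\n", scale_res);     // ~0.221
    std::printf("lm_head scale   : %.3f\n", scale_lmhead);  // ~0.111
    return 0;
}
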
@@ -6943,6 +7127,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_internlm2();
             } break;
+        case LLM_ARCH_MINICPM:
+            {
+                result = llm.build_minicpm();
+            } break;
         default:
             GGML_ASSERT(false);
     }