@@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_1_4B: return "1.4B";
         case LLM_TYPE_1_5B: return "1.5B";
         case LLM_TYPE_1_6B: return "1.6B";
+        case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B: return "2B";
         case LLM_TYPE_2_8B: return "2.8B";
         case LLM_TYPE_2_9B: return "2.9B";
@@ -1144,6 +1145,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_PLM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
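+                // n_lora_kv is the rank of the compressed (MLA) KV projection; same GGUF key the DeepSeek2 path reads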
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_1_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_CHATGLM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3068,6 +3078,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_PLM:
+            {
+                const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                // output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
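+                // the output head is tied to the token embeddings, so tok_embd is reused here (TENSOR_DUPLICATED)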
+                output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
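+                    // MLA attention: full-rank Q, KV compressed to kv_lora_rank plus a shared RoPE slice (wkv_a_mqa), then expanded per head (wkv_b)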
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                    layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+                    layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+                    layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
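+                    // feed-forward has no gate tensor; PLM uses a squared-ReLU FFN (see LLM_FFN_RELU_SQR in llm_build_plm)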
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                }
+            } break;
         case LLM_ARCH_BITNET:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -11615,6 +11654,178 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
     }
 };
 
+struct llm_build_plm : public llm_graph_context {
+    llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+
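+        // each attention head is split into a RoPE'd slice (n_rot dims) and a non-RoPE'd "nope" slice; KV is first compressed to kv_lora_rank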
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        // {n_embd, n_tokens}
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                ggml_tensor * q = NULL;
+                q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(q, "q", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compressed, "kv_pe_compressed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
+                        kv_pe_compressed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compressed->nb[1],
+                        kv_pe_compressed->nb[1],
+                        ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                kv_compressed = build_norm(kv_compressed,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
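+                // v_states is a strided view into kv; make it contiguous before flattening to 2D for the attention call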
+                v_states = ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
+                cb(v_states, "v_states", il);
+
+                q_pe = ggml_rope_ext(
+                        ctx0, q_pe, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = ggml_rope_ext(
+                        ctx0, k_pe, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
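+                // k_pe holds a single shared RoPE key; repeat it across all heads (to q_pe's shape) before concatenating with k_nope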
+                ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, NULL,
+                        q_states, k_states, v_states, nullptr, kq_scale, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
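+            // gateless feed-forward: up projection, squared-ReLU activation, then down projection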
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    NULL, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory() const {
     llama_memory_i * res;
 
@@ -11887,6 +12098,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
            } break;
+        case LLM_ARCH_PLM:
+            {
+                llm = std::make_unique<llm_build_plm>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -12013,6 +12228,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE: