@@ -165,6 +165,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -219,6 +220,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -955,6 +957,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
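Aside (illustration, not part of the patch): the "%d" in the per-layer names above is substituted with the block index when the loader builds the full tensor name, and a ".weight" or ".bias" suffix is appended. A minimal standalone sketch of the resulting GGUF tensor names:

    #include <cstdio>

    int main() {
        // "blk.%d.attn_qkv" from the PHIMOE table above; the loader fills in
        // the block index and appends a ".weight" (or ".bias") suffix.
        char name[64];
        for (int i = 0; i < 3; ++i) {
            std::snprintf(name, sizeof(name), "blk.%d.attn_qkv.weight", i);
            std::printf("%s\n", name); // blk.0.attn_qkv.weight, blk.1..., blk.2...
        }
        return 0;
    }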
@@ -2428,6 +2451,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_16x3_8B,
     MODEL_10B_128x3_66B,
     MODEL_57B_A14B,
     MODEL_27B,
@@ -5412,6 +5436,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x7B: return "8x7B";
         case MODEL_8x22B: return "8x22B";
         case MODEL_16x12B: return "16x12B";
+        case MODEL_16x3_8B: return "16x3.8B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
         case MODEL_57B_A14B: return "57B.A14B";
         case MODEL_27B: return "27B";
@@ -5817,6 +5842,15 @@ static void llm_load_hparams(
                     throw std::runtime_error("invalid value for sliding_window");
                 }
             } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_16x3_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8325,6 +8359,50 @@ static bool llm_load_tensors(
                     layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                 }
             } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                const int64_t n_embd_head = n_embd / n_head;
+
+                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   { n_embd }, 0);
+                model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+                model.output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                    layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
+
+                    layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (layer.wqkv == nullptr) {
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);
+
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);
+
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
+                    }
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                    layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
+
+                    layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                    layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
+
+                    layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+                    layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                    layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                }
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
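Aside (illustration, not part of the patch): ffn_gate_exps, ffn_up_exps and ffn_down_exps each stack all experts into a single 3-D tensor, with the shapes given above. A rough back-of-the-envelope sketch of what that means in parameter counts, assuming Phi-3.5-MoE-like values of n_embd = 4096, n_ff = 6400, n_expert = 16 and 32 layers (these numbers are assumptions here, not read from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumed Phi-3.5-MoE-like hyperparameters (illustration only).
        const int64_t n_embd   = 4096;
        const int64_t n_ff     = 6400;
        const int64_t n_expert = 16;
        const int64_t n_layer  = 32;

        // Elements in one stacked expert tensor, e.g. {n_embd, n_ff, n_expert}.
        const int64_t per_tensor = n_embd * n_ff * n_expert;
        // Gate, up and down projections per layer.
        const int64_t per_layer  = 3 * per_tensor;

        std::printf("elements per stacked expert tensor: %lld\n", (long long) per_tensor);
        std::printf("expert FFN elements per layer: %lld (~%.1f B over %lld layers)\n",
                    (long long) per_layer, (double) n_layer * per_layer / 1e9, (long long) n_layer);
        return 0;
    }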
@@ -16680,6 +16758,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_phi2();
             } break;
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
             {
                 result = llm.build_phi3();
             } break;
@@ -20012,6 +20091,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2: