
Commit bcf0194

https://github.com/ggerganov/llama.cpp/pull/11305
1 parent bf572ca commit bcf0194

File tree: 1 file changed, +80 -0 lines

src/llama.cpp

Lines changed: 80 additions & 0 deletions
@@ -165,6 +165,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2MOE,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -219,6 +220,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2MOE, "qwen2moe" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -955,6 +957,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
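The "blk.%d.*" strings above are printf-style templates: at load time the layer index is substituted in to produce the concrete tensor name stored in the GGUF file, e.g. "blk.3.ffn_gate_exps.weight". A minimal sketch of that expansion, using a standalone helper rather than the actual tn() machinery in llama.cpp (the helper name is illustrative only):

    #include <cstdio>
    #include <string>

    // Illustrative helper only (not the real tn() API): expand a "blk.%d.*"
    // template plus a suffix into the tensor name looked up in the GGUF file.
    static std::string expand_tensor_name(const char * tmpl, const char * suffix, int layer) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, layer);
        return std::string(buf) + "." + suffix;
    }

    // expand_tensor_name("blk.%d.ffn_gate_exps", "weight", 3) == "blk.3.ffn_gate_exps.weight"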
@@ -2428,6 +2451,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_16x3_8B,
     MODEL_10B_128x3_66B,
     MODEL_57B_A14B,
     MODEL_27B,
@@ -5412,6 +5436,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x7B:          return "8x7B";
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
+        case MODEL_16x3_8B:       return "16x3.8B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
         case MODEL_57B_A14B:      return "57B.A14B";
         case MODEL_27B:           return "27B";
@@ -5817,6 +5842,15 @@ static void llm_load_hparams(
                     throw std::runtime_error("invalid value for sliding_window");
                 }
             } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_16x3_8B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_PLAMO:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8325,6 +8359,50 @@ static bool llm_load_tensors(
                         layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                     }
                 } break;
+            case LLM_ARCH_PHIMOE:
+                {
+                    const int64_t n_embd_head = n_embd / n_head;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   { n_embd }, 0);
+                    model.output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
+                    model.output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        if (layer.wqkv == nullptr) {
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), { n_embd }, 0);
+
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), { n_embd_gqa }, 0);
+
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), { n_embd_gqa }, 0);
+                        }
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);
+
+                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert }, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff, n_expert }, 0);
+
+                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    }
+                } break;
             case LLM_ARCH_PLAMO:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
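Note that the expert weights are loaded as single 3-D tensors: in ggml's dimension order, the expert index is the slowest-varying axis, so ffn_gate_exps holds n_expert stacked {n_embd, n_ff} matrices and ffn_down_exps the {n_ff, n_embd} counterpart. A rough sketch of the resulting per-layer element count, using illustrative values rather than anything read from a model file:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Illustrative values only; the real ones come from the GGUF metadata.
        const int64_t n_embd   = 4096;
        const int64_t n_ff     = 6400;
        const int64_t n_expert = 16;

        // One fused tensor per projection per layer: n_expert stacked matrices.
        const int64_t gate_elems = n_embd * n_ff * n_expert;
        const int64_t up_elems   = n_embd * n_ff * n_expert;
        const int64_t down_elems = n_ff * n_embd * n_expert;

        std::printf("expert weights per layer: %lld elements\n",
                    (long long) (gate_elems + up_elems + down_elems));
        return 0;
    }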
@@ -16680,6 +16758,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_phi2();
             } break;
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
             {
                 result = llm.build_phi3();
             } break;
@@ -20012,6 +20091,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2:
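Since llama_rope_type() is part of the public API, downstream code can check what a loaded PhiMoE model reports. A minimal sketch, assuming the model has already been loaded elsewhere and assuming (not verified here) that this case group returns the NEOX-style rope type, as it does for PHI3:

    #include "llama.h"
    #include <cstdio>

    // Sketch only: `model` is assumed to have been loaded via the usual llama.cpp API.
    static void print_rope_type(const struct llama_model * model) {
        // Assumption based on the case grouping above: PHIMOE reports the same
        // rope type as PHI3.
        if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NEOX) {
            std::printf("model uses NEOX-style RoPE\n");
        }
    }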
