@@ -282,6 +282,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_SHARED_EXPERT_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -360,21 +361,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,                "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,            "general.source.huggingface.repository" },
 
-    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                 },
-    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"             },
-    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"           },
-    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"  },
-    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"        },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"      },
-    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"         },
-    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"               },
-    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"          },
-    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"        },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"       },
-    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"               },
-    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                },
+    { LLM_KV_VOCAB_SIZE,                                "%s.vocab_size"                        },
+    { LLM_KV_CONTEXT_LENGTH,                            "%s.context_length"                    },
+    { LLM_KV_EMBEDDING_LENGTH,                          "%s.embedding_length"                  },
+    { LLM_KV_BLOCK_COUNT,                               "%s.block_count"                       },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,                 "%s.leading_dense_block_count"         },
+    { LLM_KV_FEED_FORWARD_LENGTH,                       "%s.feed_forward_length"               },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,                "%s.expert_feed_forward_length"        },
+    { LLM_KV_SHARED_EXPERT_FEED_FORWARD_LENGTH,         "%s.shared_expert_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,                     "%s.use_parallel_residual"             },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                        "%s.tensor_data_layout"                },
+    { LLM_KV_EXPERT_COUNT,                              "%s.expert_count"                      },
+    { LLM_KV_EXPERT_USED_COUNT,                         "%s.expert_used_count"                 },
+    { LLM_KV_EXPERT_SHARED_COUNT,                       "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,                      "%s.expert_weights_scale"              },
+    { LLM_KV_POOLING_TYPE ,                             "%s.pooling_type"                      },
+    { LLM_KV_LOGIT_SCALE,                               "%s.logit_scale"                       },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,              "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,           "%s.attention.head_count_kv"          },
@@ -1840,6 +1842,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;
 
@@ -1888,6 +1891,7 @@ struct llama_hparams {
         if (this->n_lora_q  != other.n_lora_q)  return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp  != other.n_ff_exp)  return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -4248,6 +4252,7 @@ static void llm_load_hparams(
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_SHARED_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
@@ -5024,6 +5029,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     if (model.arch == LLM_ARCH_QWEN2MOE) {
         LLAMA_LOG_INFO("%s: n_ff_exp   = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 }
 
@@ -5817,11 +5823,11 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert});
 
                         // Shared expert branch
-                        auto n_ff_shared_exp = hparams.n_ff_exp && hparams.n_expert_used ? hparams.n_ff_exp * hparams.n_expert_used : n_ff;
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shared_exp});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shared_exp, n_embd});
-                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shared_exp});
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
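
Note on the fallback in the last hunk: ml.get_key(..., false) treats the new GGUF key as optional, so hparams.n_ff_shexp stays at its default of 0 when "<arch>.shared_expert_feed_forward_length" is missing, and the tensor loader then reuses n_ff for the shared expert. A minimal standalone sketch of that behaviour, separate from the llama.cpp sources and using made-up widths purely for illustration:

#include <cstdint>
#include <cstdio>

// Mirrors `hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff` from the diff above.
// A value of 0 means the optional metadata key was absent from the GGUF file.
static uint32_t resolve_n_ff_shexp(uint32_t n_ff_shexp, uint32_t n_ff) {
    return n_ff_shexp ? n_ff_shexp : n_ff;
}

int main() {
    std::printf("%u\n", resolve_n_ff_shexp(    0, 11008)); // key missing -> falls back to n_ff (11008)
    std::printf("%u\n", resolve_n_ff_shexp(20480, 11008)); // key present -> uses the stored value (20480)
    return 0;
}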