@@ -7882,7 +7882,6 @@ static struct ggml_tensor * llm_build_lora_mm(
         if (lora == nullptr) {
             continue;
         }
-        // TODO: check if lora_a need transpose
         struct ggml_tensor * ab_cur = ggml_mul_mat(
             ctx0, lora->b,
             ggml_mul_mat(ctx0, lora->a, cur)
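
Note: the dense helper above boils down to res = W*x + scale*(B*(A*x)), with lora->a and lora->b as the low-rank factors and it.second supplying the scale. A minimal standalone sketch of that arithmetic with plain loops; the toy dimensions, the matvec helper, and the scale value here are illustrative, not from the commit:

    #include <cstdio>
    #include <vector>

    // returns m * v for a rows x cols row-major matrix (helper for the sketch)
    static std::vector<float> matvec(const std::vector<float> & m,
                                     const std::vector<float> & v,
                                     int rows, int cols) {
        std::vector<float> out(rows, 0.0f);
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                out[i] += m[i*cols + j] * v[j];
            }
        }
        return out;
    }

    int main() {
        const int n_in = 4, n_out = 3, r = 2;        // toy sizes (r = LoRA rank)
        std::vector<float> W(n_out*n_in, 0.10f);     // base weight, plays the role of w
        std::vector<float> A(r*n_in,     0.05f);     // lora->a: [r x n_in]
        std::vector<float> B(n_out*r,    0.02f);     // lora->b: [n_out x r]
        std::vector<float> x = {1, 2, 3, 4};         // cur, for a single token
        const float scale = 1.0f;                    // stands in for it.second

        std::vector<float> res = matvec(W, x, n_out, n_in); // W*x     (ggml_mul_mat)
        std::vector<float> ax  = matvec(A, x, r, n_in);     // A*x     (inner mul_mat)
        std::vector<float> bax = matvec(B, ax, n_out, r);   // B*(A*x) (outer mul_mat)
        for (int i = 0; i < n_out; i++) {
            res[i] += scale * bax[i];                // scale + add (ggml_scale/ggml_add)
            printf("res[%d] = %.4f\n", i, res[i]);
        }
        return 0;
    }
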
@@ -7893,6 +7892,31 @@ static struct ggml_tensor * llm_build_lora_mm(
     return res;
 }
 
+// do mat_mul_id, while optionally applying lora
+static struct ggml_tensor * llm_build_lora_mm_id(
+        struct llama_context & lctx,
+        struct ggml_context * ctx0,
+        struct ggml_tensor * w,   // struct ggml_tensor * as
+        struct ggml_tensor * cur, // struct ggml_tensor * b
+        struct ggml_tensor * ids) {
+    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
+    for (auto & it : lctx.lora_adapters) {
+        struct llama_lora_weight * lora = it.first->get_weight(w);
+        float scale = it.second;
+        if (lora == nullptr) {
+            continue;
+        }
+        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
+            ctx0, lora->b,
+            ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+            ids
+        );
+        ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+    return res;
+}
+
 static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * cur,
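
Note: the id variant repeats the same low-rank composition, but routes every matmul through ggml_mul_mat_id with the shared expert ids, so each token's update comes from the lora->a/lora->b slices of the experts it was routed to. A hedged graph-building sketch, assuming ggml is available; all dimensions and tensor names are illustrative, and nothing is computed here, only graph nodes are created:

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16u*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // toy dimensions (illustrative, not from the commit)
        const int n_embd = 8, n_ff = 16, n_expert = 4, n_used = 2, n_tokens = 3, rank = 2;

        // per-expert base weights and LoRA factors: one slice per expert
        struct ggml_tensor * w   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);
        struct ggml_tensor * la  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, rank, n_expert); // lora->a
        struct ggml_tensor * lb  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, rank, n_ff, n_expert);   // lora->b
        struct ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1, n_tokens);
        struct ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_used, n_tokens);

        // base projection: [n_ff, n_used, n_tokens]
        struct ggml_tensor * res = ggml_mul_mat_id(ctx, w, cur, ids);
        // low-rank update with the same ids: B_e * (A_e * x) per selected expert e
        struct ggml_tensor * ab  = ggml_mul_mat_id(ctx, lb, ggml_mul_mat_id(ctx, la, cur, ids), ids);
        res = ggml_add(ctx, res, ggml_scale(ctx, ab, 1.0f)); // 1.0f stands in for it.second

        ggml_free(ctx);
        return 0;
    }
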
@@ -8103,10 +8127,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
     }
 
     cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
-    ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
-    ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+    ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(gate, "ffn_moe_gate", il);
 
     switch (type_op) {
@@ -8127,7 +8151,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
     cb(par, "ffn_moe_gate_par", il);
 
-    ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
     experts = ggml_mul(ctx, experts, weights);