@@ -2821,35 +2821,34 @@ struct llama_context {
     struct llama_control_vector cvec;

     // lora adapters and scales
-    std::map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 };

-struct lora_weight {
+struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
-    lora_weight() {}
-    lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
+    llama_lora_weight() {}
+    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
 };

 struct llama_lora_adapter {
     struct llama_model * base_model;
     // map tensor name to lora_a_b
-    std::map<std::string, struct lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
     std::vector<struct ggml_context *> ctxs;
     std::vector<ggml_backend_buffer_t> bufs;

     llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
         base_model->lora_adapters.insert(this);
     }

-    bool has_weight(struct ggml_tensor * w) {
+    llama_lora_weight * get_weight(struct ggml_tensor * w) {
         std::string name(w->name);
-        return ab_map.find(name) != ab_map.end();
-    }
-
-    lora_weight & get_weight(struct ggml_tensor * w) {
-        std::string name(w->name);
-        return ab_map.at(name);
+        auto pos = ab_map.find(name);
+        if (ab_map.find(name) != ab_map.end()) {
+            return &pos->second;
+        }
+        return nullptr;
     }

     ~llama_lora_adapter() {
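
(Not part of the diff — a minimal standalone sketch of the new single-lookup pattern, using toy stand-in types and an illustrative tensor name; the real code uses ggml_tensor and the ab_map above.)

    #include <cstdio>
    #include <string>
    #include <unordered_map>

    // Hypothetical stand-ins for ggml_tensor / llama_lora_weight, only to
    // illustrate the pointer-returning lookup.
    struct toy_tensor      { std::string name; };
    struct toy_lora_weight { toy_tensor * a; toy_tensor * b; };

    struct toy_adapter {
        std::unordered_map<std::string, toy_lora_weight> ab_map;

        // Single lookup: returns nullptr when the weight has no LoRA pair,
        // replacing the previous has_weight()/get_weight() double lookup.
        toy_lora_weight * get_weight(const toy_tensor & w) {
            auto pos = ab_map.find(w.name);
            return pos == ab_map.end() ? nullptr : &pos->second;
        }
    };

    int main() {
        toy_tensor a{"blk.0.attn_q.weight.lora_a"};
        toy_tensor b{"blk.0.attn_q.weight.lora_b"};
        toy_adapter adapter;
        adapter.ab_map["blk.0.attn_q.weight"] = {&a, &b};

        toy_tensor wq{"blk.0.attn_q.weight"};
        if (toy_lora_weight * lw = adapter.get_weight(wq)) {
            std::printf("found pair: %s / %s\n", lw->a->name.c_str(), lw->b->name.c_str());
        }
        return 0;
    }
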
@@ -7855,23 +7854,22 @@ static void llm_build_kv_store(
 }

 // do mat_mul, while optionally apply lora
-static struct ggml_tensor * llm_build_mm(
+static struct ggml_tensor * llm_build_lora_mm(
         struct llama_context & lctx,
         struct ggml_context * ctx0,
         struct ggml_tensor * w,
         struct ggml_tensor * cur) {
     struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
     for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_adapter * adapter = it.first;
+        struct llama_lora_weight * lora = it.first->get_weight(w);
         float scale = it.second;
-        if (!adapter->has_weight(w)) {
+        if (lora == nullptr) {
             continue;
         }
-        struct lora_weight & lora = adapter->get_weight(w);
         // TODO: check if lora_a need transpose
-        struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a));
+        struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a));
         struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lora.b,
+            ctx0, lora->b,
             ggml_mul_mat(ctx0, a, cur)
         );
         ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale);
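
(Side note, not part of the diff: llm_build_lora_mm computes res = W·cur and, for each active adapter holding an (A, B) pair for this weight, a scaled low-rank correction scale · B·(A·cur); presumably the hunk continues below this cutoff by adding ab_cur into res. A tiny scalar illustration of that formula, independent of ggml's mul_mat conventions and with made-up dimensions:)

    #include <cstdio>
    #include <vector>

    // y = W x, where W is rows x cols, stored row-major.
    static std::vector<float> matvec(const std::vector<float> & W, const std::vector<float> & x,
                                     int rows, int cols) {
        std::vector<float> y(rows, 0.0f);
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                y[r] += W[r*cols + c] * x[c];
            }
        }
        return y;
    }

    int main() {
        const int n_in = 4, n_out = 3, rank = 2;
        std::vector<float> W(n_out*n_in, 0.1f);   // base weight
        std::vector<float> A(rank*n_in,  0.01f);  // lora_a: rank x n_in
        std::vector<float> B(n_out*rank, 0.02f);  // lora_b: n_out x rank
        std::vector<float> x(n_in, 1.0f);
        const float scale = 0.5f;

        std::vector<float> y  = matvec(W, x,  n_out, n_in);  // base projection W x
        std::vector<float> ax = matvec(A, x,  rank,  n_in);  // A x
        std::vector<float> d  = matvec(B, ax, n_out, rank);  // B (A x)
        for (int i = 0; i < n_out; ++i) {
            y[i] += scale * d[i];                             // y = W x + scale * B (A x)
            std::printf("%f\n", y[i]);
        }
        return 0;
    }
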
@@ -7930,7 +7928,7 @@ static struct ggml_tensor * llm_build_ffn(
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
         int il) {
-    struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur;
+    struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);

     if (up_b) {
@@ -7947,12 +7945,12 @@ static struct ggml_tensor * llm_build_ffn(
         switch (type_gate) {
             case LLM_FFN_SEQ:
                 {
-                    cur = llm_build_mm(lctx, ctx, gate, tmp);
+                    cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
                     cb(cur, "ffn_gate", il);
                 } break;
             case LLM_FFN_PAR:
                 {
-                    cur = llm_build_mm(lctx, ctx, gate, cur);
+                    cur = llm_build_lora_mm(lctx, ctx, gate, cur);
                     cb(cur, "ffn_gate", il);
                 } break;
         }
@@ -8020,7 +8018,7 @@ static struct ggml_tensor * llm_build_ffn(
     }

     if (down) {
-        cur = llm_build_mm(lctx, ctx, down, cur);
+        cur = llm_build_lora_mm(lctx, ctx, down, cur);
     }

     if (down_b) {
@@ -8058,7 +8056,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
     int64_t n_embd   = cur->ne[0];
     int64_t n_tokens = cur->ne[1];

-    ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
+    ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);

     ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
@@ -8199,7 +8197,7 @@ static struct ggml_tensor * llm_build_kqv(

         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
-        struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q);
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);

         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
@@ -8242,7 +8240,7 @@ static struct ggml_tensor * llm_build_kqv(
                 0);
         cb(v, "v", il);

-        struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq);
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
         cb(kqv, "kqv", il);

         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
@@ -8255,7 +8253,7 @@ static struct ggml_tensor * llm_build_kqv(
     ggml_build_forward_expand(graph, cur);

     if (wo) {
-        cur = llm_build_mm(lctx, ctx, wo, cur);
+        cur = llm_build_lora_mm(lctx, ctx, wo, cur);
     }

     if (wo_b) {
@@ -8762,21 +8760,21 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }

-                struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }

-                struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -8864,7 +8862,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);

         // lm_head
-        cur = llm_build_mm(lctx, ctx0, model.output, cur);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
@@ -18517,7 +18515,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }

-static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
     static const int n_inp_tensors = 5; // see llama_model
     static const int n_out_tensors = 5; // see llama_model
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
@@ -18532,7 +18530,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
     if (!ctx_gguf) {
         LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora);
-        return -1;
+        throw std::exception();
     }

     // calculate n_tensors_per_layer
@@ -18574,7 +18572,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     }

     // bundle lora_a and lora_b into pairs
-    std::map<std::string, lora_weight> ab_map;
+    std::map<std::string, llama_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -18583,18 +18581,19 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = lora_weight(cur, nullptr);
+                ab_map[name] = llama_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = lora_weight(nullptr, cur);
+                ab_map[name] = llama_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
         } else {
+            // maybe "optimizer.*" tensors
             LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
         }
     }
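
(Illustration only, not part of the diff: the loop above pairs `.lora_a` / `.lora_b` tensors under a shared base name and discards anything else. A self-contained sketch of that pairing, with made-up GGUF-style names:)

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct toy_pair { std::string a, b; }; // stand-in for llama_lora_weight

    int main() {
        // Names as they might appear in a converted LoRA GGUF (illustrative only).
        std::vector<std::string> names = {
            "blk.0.attn_q.weight.lora_a",
            "blk.0.attn_q.weight.lora_b",
            "blk.0.ffn_down.weight.lora_b",
            "blk.0.ffn_down.weight.lora_a",
            "optimizer.step", // discarded, like the "maybe optimizer.*" case above
        };

        auto ends_with = [](const std::string & s, const std::string & suf) {
            return s.size() >= suf.size() && s.compare(s.size()-suf.size(), suf.size(), suf) == 0;
        };

        std::map<std::string, toy_pair> ab_map;
        for (const auto & n : names) {
            if (ends_with(n, ".lora_a")) {
                ab_map[n.substr(0, n.size()-7)].a = n;  // strip ".lora_a"
            } else if (ends_with(n, ".lora_b")) {
                ab_map[n.substr(0, n.size()-7)].b = n;  // strip ".lora_b"
            } else {
                std::printf("discard tensor '%s'\n", n.c_str());
            }
        }
        for (const auto & it : ab_map) {
            std::printf("%s -> (%s, %s)\n", it.first.c_str(), it.second.a.c_str(), it.second.b.c_str());
        }
        return 0;
    }
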
@@ -18603,28 +18602,26 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     for (auto & it : ab_map) {
         std::string name = it.first;
         const char * cname = name.c_str();
-        lora_weight & w = it.second;
+        llama_lora_weight & w = it.second;
         GGML_ASSERT(w.a != nullptr);
         GGML_ASSERT(w.b != nullptr);
         int il = -1;
         sscanf(cname, "blk.%d.", &il);
-        struct ggml_context * dev_ctx; // device ctx
-        if (il >= 0) {
-            dev_ctx = ctx_map.at(model->buft_layer[il].buft);
-        } else if (strstr(cname, "tok") == 0) {
-            dev_ctx = ctx_map.at(model->buft_input.buft);
-        } else if (strstr(cname, "output") == 0) {
-            dev_ctx = ctx_map.at(model->buft_output.buft);
-        } else {
-            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname);
-            continue;
+        // device buft and device ctx
+        auto model_tensor = llama_get_model_tensor(model, cname);
+        if (!model_tensor) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n");
         }
+        struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
+        // TODO: validate tensor shape
         // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b);
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }

     // allocate tensors / buffers and zero
@@ -18636,8 +18633,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
         ggml_context * ctx_dev = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__);
-            return -1;
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("failed to allocate buffer for lora adapter\n");
         }
         ggml_backend_buffer_clear(buf, 0);
         adapter.ctxs.push_back(ctx_dev);
@@ -18671,14 +18669,18 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);

     // free ctx for reading gguf
+    gguf_free(ctx_gguf);
     ggml_free(ctx);
-    return 0;
 }

 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
         float scale) {
+    if (ctx->cparams.flash_attn) {
+        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
+        return -1;
+    }
     ctx->lora_adapters[adapter] = scale;
     return 0;
 }
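
(Caller-side sketch, not part of the diff: how the new llama_lora_adapter_init / llama_lora_adapter_set entry points are meant to fit together, assuming the pre-existing llama.h loading API of this period; file paths are placeholders and error handling is trimmed.)

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("base-model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = false; // per the check above, LoRA currently rejects flash_attn
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // New API from this change: load the adapter once per model,
        // then attach it to a context with a scale.
        llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
        if (adapter == nullptr || llama_lora_adapter_set(ctx, adapter, 1.0f) != 0) {
            return 1;
        }

        // ... run inference as usual ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
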
@@ -19479,8 +19481,8 @@ uint32_t llama_model_quantize(
 struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
     try {
         struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
-        int res = llama_lora_adapter_init_internal(model, path_lora, *adapter);
-        return res == 0 ? adapter : nullptr;
+        llama_lora_adapter_init_internal(model, path_lora, *adapter);
+        return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return nullptr;