
Commit 79e2982

update based on review comments
1 parent 30faf1f · commit 79e2982

File tree

1 file changed: +54, -52 lines


src/llama.cpp

Lines changed: 54 additions & 52 deletions
@@ -2821,35 +2821,34 @@ struct llama_context {
     struct llama_control_vector cvec;
 
     // lora adapters and scales
-    std::map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 };
 
-struct lora_weight {
+struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
-    lora_weight() {}
-    lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
+    llama_lora_weight() {}
+    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
 };
 
 struct llama_lora_adapter {
     struct llama_model * base_model;
     // map tensor name to lora_a_b
-    std::map<std::string, struct lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
     std::vector<struct ggml_context *> ctxs;
     std::vector<ggml_backend_buffer_t> bufs;
 
     llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
         base_model->lora_adapters.insert(this);
     }
 
-    bool has_weight(struct ggml_tensor * w) {
+    llama_lora_weight * get_weight(struct ggml_tensor * w) {
         std::string name(w->name);
-        return ab_map.find(name) != ab_map.end();
-    }
-
-    lora_weight & get_weight(struct ggml_tensor * w) {
-        std::string name(w->name);
-        return ab_map.at(name);
+        auto pos = ab_map.find(name);
+        if (ab_map.find(name) != ab_map.end()) {
+            return &pos->second;
+        }
+        return nullptr;
     }
 
     ~llama_lora_adapter() {
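
Note: with this revision the adapter exposes a single lookup instead of the previous has_weight()/get_weight() pair, returning a pointer that is nullptr when the adapter holds no low-rank pair for the given weight. A hedged caller-side sketch (the adapter pointer and the weight tensor are placeholders, not part of the diff):

    // one hash lookup instead of has_weight() followed by get_weight()
    struct ggml_tensor * w = /* some base-model weight tensor */ nullptr;
    llama_lora_weight * lw = adapter->get_weight(w);
    if (lw != nullptr) {
        // lw->a and lw->b are the low-rank factors stored for this weight
    }
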
@@ -7855,23 +7854,22 @@ static void llm_build_kv_store(
 }
 
 // do mat_mul, while optionally apply lora
-static struct ggml_tensor * llm_build_mm(
+static struct ggml_tensor * llm_build_lora_mm(
         struct llama_context & lctx,
         struct ggml_context * ctx0,
         struct ggml_tensor * w,
         struct ggml_tensor * cur) {
     struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
     for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_adapter * adapter = it.first;
+        struct llama_lora_weight * lora = it.first->get_weight(w);
         float scale = it.second;
-        if (!adapter->has_weight(w)) {
+        if (lora == nullptr) {
             continue;
         }
-        struct lora_weight & lora = adapter->get_weight(w);
         // TODO: check if lora_a need transpose
-        struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora.a));
+        struct ggml_tensor * a = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a));
         struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lora.b,
+            ctx0, lora->b,
             ggml_mul_mat(ctx0, a, cur)
         );
         ab_cur = ggml_scale_inplace(ctx0, ab_cur, scale);
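
Note: the renamed helper applies the usual LoRA composition on top of the base matmul, roughly res = W·cur + scale · B·(Aᵀ·cur) for every attached adapter (the transpose of A is still flagged by the TODO above). A condensed sketch of the same flow, assuming the hunk continues by adding the scaled ab_cur into res (that final add is not shown here):

    // y = W x, then y += scale * B (A^T x) for each adapter that has a pair for w
    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
    for (auto & it : lctx.lora_adapters) {
        llama_lora_weight * lora = it.first->get_weight(w);
        if (lora == nullptr) {
            continue; // this adapter does not touch w
        }
        struct ggml_tensor * a  = ggml_cont(ctx0, ggml_transpose(ctx0, lora->a));
        struct ggml_tensor * ab = ggml_mul_mat(ctx0, lora->b, ggml_mul_mat(ctx0, a, cur));
        res = ggml_add(ctx0, res, ggml_scale_inplace(ctx0, ab, it.second)); // assumed final add
    }
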
@@ -7930,7 +7928,7 @@ static struct ggml_tensor * llm_build_ffn(
         llm_ffn_gate_type type_gate,
         const llm_build_cb & cb,
         int il) {
-    struct ggml_tensor * tmp = up ? llm_build_mm(lctx, ctx, up, cur) : cur;
+    struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
     cb(tmp, "ffn_up", il);
 
     if (up_b) {
@@ -7947,12 +7945,12 @@ static struct ggml_tensor * llm_build_ffn(
     switch (type_gate) {
         case LLM_FFN_SEQ:
             {
-                cur = llm_build_mm(lctx, ctx, gate, tmp);
+                cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
                 cb(cur, "ffn_gate", il);
             } break;
         case LLM_FFN_PAR:
             {
-                cur = llm_build_mm(lctx, ctx, gate, cur);
+                cur = llm_build_lora_mm(lctx, ctx, gate, cur);
                 cb(cur, "ffn_gate", il);
             } break;
     }
@@ -8020,7 +8018,7 @@ static struct ggml_tensor * llm_build_ffn(
     }
 
     if (down) {
-        cur = llm_build_mm(lctx, ctx, down, cur);
+        cur = llm_build_lora_mm(lctx, ctx, down, cur);
     }
 
     if (down_b) {
@@ -8058,7 +8056,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
     int64_t n_embd = cur->ne[0];
     int64_t n_tokens = cur->ne[1];
 
-    ggml_tensor * logits = llm_build_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
+    ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
 
     ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
@@ -8199,7 +8197,7 @@ static struct ggml_tensor * llm_build_kqv(
 
         cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
-        struct ggml_tensor * kq = llm_build_mm(lctx, ctx, k, q);
+        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
@@ -8242,7 +8240,7 @@ static struct ggml_tensor * llm_build_kqv(
                 0);
         cb(v, "v", il);
 
-        struct ggml_tensor * kqv = llm_build_mm(lctx, ctx, v, kq);
+        struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
         cb(kqv, "kqv", il);
 
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
@@ -8255,7 +8253,7 @@ static struct ggml_tensor * llm_build_kqv(
     ggml_build_forward_expand(graph, cur);
 
     if (wo) {
-        cur = llm_build_mm(lctx, ctx, wo, cur);
+        cur = llm_build_lora_mm(lctx, ctx, wo, cur);
     }
 
     if (wo_b) {
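
Note: within llm_build_kqv only the output projection (wo) keeps the LoRA-aware path; the kq and kqv products are reverted to plain ggml_mul_mat, presumably because their operands are activations and KV-cache views rather than model weights an adapter could target. Roughly:

    // struct ggml_tensor * kq  = ggml_mul_mat(ctx, k, q);    // activations only -> no LoRA lookup
    // struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);   // activations only -> no LoRA lookup
    // cur = llm_build_lora_mm(lctx, ctx, wo, cur);           // wo is a weight  -> LoRA-aware
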
@@ -8762,21 +8760,21 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = llm_build_mm(lctx, ctx0, model.layers[il].wq, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }
 
-                struct ggml_tensor * Kcur = llm_build_mm(lctx, ctx0, model.layers[il].wk, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
                 if (model.layers[il].bk) {
                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }
 
-                struct ggml_tensor * Vcur = llm_build_mm(lctx, ctx0, model.layers[il].wv, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
                 if (model.layers[il].bv) {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -8864,7 +8862,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = llm_build_mm(lctx, ctx0, model.output, cur);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
@@ -18517,7 +18515,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-static int llama_lora_adapter_init_internal(const struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
     static const int n_inp_tensors = 5; // see llama_model
     static const int n_out_tensors = 5; // see llama_model
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
@@ -18532,7 +18530,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
     if (!ctx_gguf) {
         LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora);
-        return -1;
+        throw std::exception();
     }
 
     // calculate n_tensors_per_layer
@@ -18574,7 +18572,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     }
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, lora_weight> ab_map;
+    std::map<std::string, llama_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -18583,18 +18581,19 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = lora_weight(cur, nullptr);
+                ab_map[name] = llama_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = lora_weight(nullptr, cur);
+                ab_map[name] = llama_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
         } else {
+            // maybe "optimizer.*"" tensors
             LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
         }
     }
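
Note: the pairing loop keys both halves of a pair under the base tensor name by stripping the ".lora_a" / ".lora_b" suffix, so a pair named, say, blk.0.attn_q.weight.lora_a and blk.0.attn_q.weight.lora_b (hypothetical names) ends up in one llama_lora_weight entry. A standalone sketch of the same idea, reusing the str_endswith lambda and the replace_all helper from the surrounding code:

    // collect (A, B) halves under their shared base name
    std::map<std::string, llama_lora_weight> pairs;
    auto add_half = [&](std::string name, struct ggml_tensor * t) {
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            pairs[name].a = t;            // operator[] default-constructs the entry on first sight
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            pairs[name].b = t;
        }                                 // anything else (e.g. optimizer state) is ignored
    };
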
@@ -18603,28 +18602,26 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     for (auto & it : ab_map) {
         std::string name = it.first;
         const char * cname = name.c_str();
-        lora_weight & w = it.second;
+        llama_lora_weight & w = it.second;
         GGML_ASSERT(w.a != nullptr);
         GGML_ASSERT(w.b != nullptr);
         int il = -1;
         sscanf(cname, "blk.%d.", &il);
-        struct ggml_context * dev_ctx; // device ctx
-        if (il >= 0) {
-            dev_ctx = ctx_map.at(model->buft_layer[il].buft);
-        } else if (strstr(cname, "tok") == 0) {
-            dev_ctx = ctx_map.at(model->buft_input.buft);
-        } else if (strstr(cname, "output") == 0) {
-            dev_ctx = ctx_map.at(model->buft_output.buft);
-        } else {
-            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname);
-            continue;
+        // device buft and device ctx
+        auto model_tensor = llama_get_model_tensor(model, cname);
+        if (!model_tensor) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model\n");
         }
+        struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
+        // TODO: validate tensor shape
         // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b);
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
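
Note: instead of guessing the device from the tensor name ("blk.%d.", "tok", "output"), the adapter tensors are now duplicated into the ggml_context associated with the buffer type of the matching base-model weight. A condensed restatement of that placement step (error handling and the ggml_set_name calls elided; names as in the hunk above):

    // look up the base weight, then reuse its backend buffer type
    struct ggml_tensor * base = llama_get_model_tensor(model, cname);   // nullptr -> not in base model
    struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(base->buffer));
    adapter.ab_map[name] = llama_lora_weight(ggml_dup_tensor(dev_ctx, w.a),
                                             ggml_dup_tensor(dev_ctx, w.b));
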
@@ -18636,8 +18633,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
         ggml_context * ctx_dev = it.second;
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
         if (!buf) {
-            LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__);
-            return -1;
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("failed to allocate buffer for lora adapter\n");
         }
         ggml_backend_buffer_clear(buf, 0);
         adapter.ctxs.push_back(ctx_dev);
@@ -18671,14 +18669,18 @@ static int llama_lora_adapter_init_internal(const struct llama_model * model, co
     LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 
     // free ctx for reading gguf
+    gguf_free(ctx_gguf);
     ggml_free(ctx);
-    return 0;
 }
 
 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
         float scale) {
+    if (ctx->cparams.flash_attn) {
+        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
+        return -1;
+    }
     ctx->lora_adapters[adapter] = scale;
     return 0;
 }
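
Note: attaching an adapter can now fail when the context was created with flash_attn enabled, so the return value of llama_lora_adapter_set should be checked. A hedged usage sketch (lctx and adapter are placeholders for a loaded context and adapter):

    // attach a previously loaded adapter with scale 1.0; a non-zero return signals failure
    if (llama_lora_adapter_set(lctx, adapter, 1.0f) != 0) {
        fprintf(stderr, "failed to attach lora adapter (e.g. flash_attn enabled)\n");
    }
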
@@ -19479,8 +19481,8 @@ uint32_t llama_model_quantize(
 struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
     try {
         struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
-        int res = llama_lora_adapter_init_internal(model, path_lora, *adapter);
-        return res == 0 ? adapter : nullptr;
+        llama_lora_adapter_init_internal(model, path_lora, *adapter);
+        return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return nullptr;
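
Note: since llama_lora_adapter_init_internal now reports failures by throwing and the public wrapper converts any exception into nullptr, callers only need a null check rather than an integer status. A minimal sketch (the file name is a placeholder):

    // load a LoRA adapter from a GGUF file; returns nullptr on any failure
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "my-adapter.gguf");
    if (adapter == nullptr) {
        // the error has already been logged inside llama_lora_adapter_init
    }
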
