
Commit 9841fbd

llama : lora fixes
1 parent f15167a commit 9841fbd

File tree

1 file changed, +35 -48 lines


src/llama.cpp

Lines changed: 35 additions & 48 deletions
@@ -2831,7 +2831,7 @@ struct llama_context {
 struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
-    llama_lora_weight() {}
+    llama_lora_weight() = default;
     llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
 };
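
Note: replacing the empty user-provided constructor with llama_lora_weight() = default does not change behavior for this struct, since both members already carry default member initializers; it just lets the compiler generate the constructor. The general difference between the two forms only shows up for members without initializers. A small standalone sketch (not from this commit; the struct names are hypothetical):

#include <cassert>

// With a user-provided empty constructor, value-initialization leaves
// members without initializers indeterminate; with = default they are zeroed.
struct user_provided { int x; user_provided() {} };
struct defaulted     { int x; defaulted() = default; };

int main() {
    user_provided a{};  // a.x is indeterminate
    defaulted     b{};  // b.x is zero-initialized
    assert(b.x == 0);
    (void) a;
    return 0;
}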

@@ -18519,13 +18519,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
-    static const int n_inp_tensors = 5; // see llama_model
-    static const int n_out_tensors = 5; // see llama_model
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
-        /* .no_alloc = */ false,
+        /* .no_alloc = */ true,
         /* .ctx      = */ &ctx,
     };
     struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
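
Note: flipping no_alloc to true makes gguf_init_from_file build only the tensor metadata in ctx instead of loading the adapter's tensor data; the data is streamed in from the file later (see the set_tensor lambda in the last hunk). A minimal sketch of this metadata-only load, assuming a hypothetical adapter path and error handling:

#include <cstdio>
// ggml/gguf declarations (gguf_init_params, gguf_init_from_file, ...) assumed available

int main() {
    ggml_context * ctx = nullptr;
    struct gguf_init_params params = {
        /* .no_alloc = */ true,   // create tensor metadata only, do not read data
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file("my-adapter.gguf", params);
    if (ctx_gguf == nullptr) {
        fprintf(stderr, "failed to read adapter metadata\n");
        return 1;
    }
    // ... inspect metadata, then free
    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return 0;
}
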
@@ -18536,58 +18534,43 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     // check metadata
     {
         auto get_kv_str = [&](std::string key) -> std::string {
-            std::vector<char> str_buf(32, 0); // we only get the arch, so no need big buffer here
             int id = gguf_find_key(ctx_gguf, key.c_str());
             return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
         };
         LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
         auto lora_arch_name = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
         auto lora_arch = llm_arch_from_string(lora_arch_name);
         if (lora_arch != model->arch) {
+            gguf_free(ctx_gguf);
             throw std::runtime_error("model arch and LoRA arch mismatch");
         }
+
         auto train_type = get_kv_str(llm_kv(LLM_KV_TRAINING_TYPE));
         if (train_type != "finetune_lora") {
+            gguf_free(ctx_gguf);
             throw std::runtime_error("expect training.type to be finetune_lora, but got: " + train_type);
         }
     }
 
-    // calculate n_tensors_per_layer
-    int n_tensors_per_layer = 0;
-    {
-        int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            int il = -1;
-            sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
-            if (il == 0) n_tensors_per_layer++;
-        }
-    }
-
-    // count layer buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
-    for (int64_t i = 0; i < model->hparams.n_layer; i++) {
-        buft_tensor_count[model->buft_layer[i].buft] += n_tensors_per_layer;
-    }
-    buft_tensor_count[model->buft_input.buft] += n_inp_tensors;
-    buft_tensor_count[model->buft_output.buft] += n_out_tensors;
+    int n_tensors = gguf_get_n_tensors(ctx_gguf);
 
-    // allocate contexts
+    // contexts for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
-    {
-        auto new_ggml_ctx = [](size_t n_tensors) {
+    auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            // add a new context
             struct ggml_init_params params = {
                 /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
-            return ggml_init(params);
+            ggml_context * buft_ctx = ggml_init(params);
+            ctx_map[buft] = buft_ctx;
+            return buft_ctx;
         };
-        for (auto & it : buft_tensor_count) {
-            int n_tensors = it.second;
-            // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
-            ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
-        }
-    }
+        return it->second;
+    };
 
     // bundle lora_a and lora_b into pairs
     std::map<std::string, llama_lora_weight> ab_map;
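
Note: the removed n_tensors_per_layer / buft_tensor_count bookkeeping pre-sized one ggml context per buffer type up front; get_ctx_for_buft instead creates a context lazily the first time a buffer type is encountered, sizing each one for n_tensors metadata entries (an upper bound, since no_alloc is true and only tensor headers live in the context). The create-on-first-use lookup in isolation, as a sketch with hypothetical get_or_create / Key / Resource names:

#include <map>

// Return the cached resource for key, creating it on first use --
// the same pattern get_ctx_for_buft applies to ctx_map.
template <typename Key, typename Resource>
Resource & get_or_create(std::map<Key, Resource> & cache, const Key & key) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        it = cache.emplace(key, Resource{}).first;
    }
    return it->second;
}
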
@@ -18611,33 +18594,40 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                 ab_map[name].b = cur;
             }
         } else {
-            // maybe "optimizer.*"" tensors
-            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
     }
 
     // add tensors
     for (auto & it : ab_map) {
-        std::string name = it.first;
-        const char * cname = name.c_str();
+        const std::string & name = it.first;
         llama_lora_weight & w = it.second;
-        GGML_ASSERT(w.a != nullptr);
-        GGML_ASSERT(w.b != nullptr);
-        int il = -1;
-        sscanf(cname, "blk.%d.", &il);
+
+        if (!w.a || !w.b) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
+            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
+        }
+
         // device buft and device ctx
-        auto model_tensor = llama_get_model_tensor(model, cname);
+        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
         if (!model_tensor) {
             gguf_free(ctx_gguf);
             ggml_free(ctx);
             throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
         }
-        struct ggml_context * dev_ctx = ctx_map.at(ggml_backend_buffer_get_type(model_tensor->buffer));
+        struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
         if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("tensor '" + name + "' has incorrect shape");
         }
         if (w.a->ne[1] != w.b->ne[0]) {
+            gguf_free(ctx_gguf);
+            ggml_free(ctx);
             throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
         }
         // save tensor to adapter
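
Note: adapter tensors must now arrive strictly as <base>.lora_a / <base>.lora_b pairs keyed by the base tensor name; an unexpected suffix (previously only a warning for optimizer.* tensors) or a pair missing one half aborts the load, and ctx_gguf / ctx are freed before every throw so the error paths do not leak. A standalone sketch of the pairing idea, with hypothetical ab_pair / pair_ab names (the real code stores ggml_tensor pointers in llama_lora_weight):

#include <map>
#include <stdexcept>
#include <string>
#include <vector>

struct ab_pair { bool has_a = false; bool has_b = false; };

// Group "<base>.lora_a" / "<base>.lora_b" names under their base key,
// rejecting anything with a different suffix.
static std::map<std::string, ab_pair> pair_ab(const std::vector<std::string> & names) {
    auto ends_with = [](const std::string & s, const std::string & suffix) {
        return s.size() >= suffix.size() &&
               s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
    };
    std::map<std::string, ab_pair> out;
    for (const auto & name : names) {
        if (ends_with(name, ".lora_a")) {
            out[name.substr(0, name.size() - 7)].has_a = true;   // strip ".lora_a"
        } else if (ends_with(name, ".lora_b")) {
            out[name.substr(0, name.size() - 7)].has_b = true;   // strip ".lora_b"
        } else {
            throw std::runtime_error("tensor '" + name + "' has unexpected suffix");
        }
    }
    return out;
}
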
@@ -18661,7 +18651,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
                 ggml_free(ctx);
                 throw std::runtime_error("failed to allocate buffer for lora adapter\n");
             }
-            ggml_backend_buffer_clear(buf, 0);
+            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
             adapter.ctxs.push_back(ctx_dev);
             adapter.bufs.push_back(buf);
         }
@@ -18674,12 +18664,9 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
         auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
             size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
             size_t size = ggml_nbytes(orig);
-            if (read_buf.size() < size) {
-                read_buf.resize(size);
-            }
+            read_buf.resize(size);
             gguf_file.seek(offs, SEEK_SET);
             gguf_file.read_raw(read_buf.data(), size);
-            // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
             ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
         };
         for (auto & it : adapter.ab_map) {
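
Note: because the GGUF context holds metadata only, set_tensor copies each tensor's bytes by hand: seek to its offset in the adapter file, read into a staging buffer, then upload with ggml_backend_tensor_set. The unconditional read_buf.resize(size) replaces the grow-only check; a std::vector keeps its allocation when shrunk, so the behavior is effectively the same. A minimal sketch of the read-then-upload pattern using plain stdio (hypothetical upload_tensor; the real code uses the gguf_file helper's seek / read_raw shown above):

#include <cstdint>
#include <cstdio>
#include <vector>
// ggml declarations (ggml_tensor, ggml_nbytes, ggml_backend_tensor_set) assumed available

// Stream one tensor's data from a file offset into a backend tensor.
static bool upload_tensor(FILE * f, long offs, struct ggml_tensor * dev) {
    std::vector<uint8_t> read_buf(ggml_nbytes(dev));
    if (fseek(f, offs, SEEK_SET) != 0) {
        return false;
    }
    if (fread(read_buf.data(), 1, read_buf.size(), f) != read_buf.size()) {
        return false;
    }
    ggml_backend_tensor_set(dev, read_buf.data(), 0, read_buf.size());
    return true;
}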
